torchcodec-0.8.0-cp311-cp311-win_amd64.whl → torchcodec-0.8.1-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchcodec might be problematic.
- torchcodec/_core/AVIOTensorContext.cpp +23 -16
- torchcodec/_core/AVIOTensorContext.h +2 -1
- torchcodec/_core/BetaCudaDeviceInterface.cpp +168 -86
- torchcodec/_core/BetaCudaDeviceInterface.h +7 -5
- torchcodec/_core/CMakeLists.txt +1 -19
- torchcodec/_core/CUDACommon.cpp +21 -6
- torchcodec/_core/CUDACommon.h +6 -1
- torchcodec/_core/Cache.h +6 -20
- torchcodec/_core/CpuDeviceInterface.cpp +7 -1
- torchcodec/_core/CpuDeviceInterface.h +4 -1
- torchcodec/_core/CudaDeviceInterface.cpp +19 -11
- torchcodec/_core/CudaDeviceInterface.h +6 -1
- torchcodec/_core/DeviceInterface.h +27 -27
- torchcodec/_core/Encoder.cpp +51 -7
- torchcodec/_core/Encoder.h +12 -1
- torchcodec/_core/FFMPEGCommon.cpp +1 -1
- torchcodec/_core/FFMPEGCommon.h +9 -1
- torchcodec/_core/FilterGraph.cpp +2 -1
- torchcodec/_core/Frame.cpp +5 -0
- torchcodec/_core/Frame.h +1 -1
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +3 -13
- torchcodec/_core/NVDECCache.h +4 -6
- torchcodec/_core/SingleStreamDecoder.cpp +22 -31
- torchcodec/_core/SingleStreamDecoder.h +4 -2
- torchcodec/_core/StreamOptions.h +2 -2
- torchcodec/_core/Transform.cpp +27 -0
- torchcodec/_core/Transform.h +25 -0
- torchcodec/_core/__init__.py +3 -0
- torchcodec/_core/custom_ops.cpp +99 -22
- torchcodec/_core/ops.py +76 -16
- torchcodec/decoders/_video_decoder.py +0 -10
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +6 -4
- torchcodec-0.8.1.dist-info/RECORD +82 -0
- torchcodec-0.8.0.dist-info/RECORD +0 -80
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +0 -0
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/AVIOTensorContext.cpp
CHANGED

@@ -18,15 +18,15 @@ constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
 int read(void* opaque, uint8_t* buf, int buf_size) {
   auto tensorContext = static_cast<detail::TensorContext*>(opaque);
   TORCH_CHECK(
-      tensorContext->
-      "Tried to read outside of the buffer:
-      tensorContext->
+      tensorContext->current_pos <= tensorContext->data.numel(),
+      "Tried to read outside of the buffer: current_pos=",
+      tensorContext->current_pos,
       ", size=",
       tensorContext->data.numel());

   int64_t numBytesRead = std::min(
       static_cast<int64_t>(buf_size),
-      tensorContext->data.numel() - tensorContext->
+      tensorContext->data.numel() - tensorContext->current_pos);

   TORCH_CHECK(
       numBytesRead >= 0,
@@ -34,8 +34,8 @@ int read(void* opaque, uint8_t* buf, int buf_size) {
       numBytesRead,
       ", size=",
       tensorContext->data.numel(),
-      ",
-      tensorContext->
+      ", current_pos=",
+      tensorContext->current_pos);

   if (numBytesRead == 0) {
     return AVERROR_EOF;
@@ -43,9 +43,9 @@ int read(void* opaque, uint8_t* buf, int buf_size) {

   std::memcpy(
       buf,
-      tensorContext->data.data_ptr<uint8_t>() + tensorContext->
+      tensorContext->data.data_ptr<uint8_t>() + tensorContext->current_pos,
       numBytesRead);
-  tensorContext->
+  tensorContext->current_pos += numBytesRead;
   return numBytesRead;
 }

@@ -54,7 +54,7 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
   auto tensorContext = static_cast<detail::TensorContext*>(opaque);

   int64_t bufSize = static_cast<int64_t>(buf_size);
-  if (tensorContext->
+  if (tensorContext->current_pos + bufSize > tensorContext->data.numel()) {
     TORCH_CHECK(
         tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE,
         "We tried to allocate an output encoded tensor larger than ",
@@ -68,13 +68,17 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
   }

   TORCH_CHECK(
-      tensorContext->
+      tensorContext->current_pos + bufSize <= tensorContext->data.numel(),
       "Re-allocation of the output tensor didn't work. ",
       "This should not happen, please report on TorchCodec bug tracker");

   uint8_t* outputTensorData = tensorContext->data.data_ptr<uint8_t>();
-  std::memcpy(outputTensorData + tensorContext->
-  tensorContext->
+  std::memcpy(outputTensorData + tensorContext->current_pos, buf, bufSize);
+  tensorContext->current_pos += bufSize;
+  // Track the maximum position written so getOutputTensor's narrow() does not
+  // truncate the file if final seek was backwards
+  tensorContext->max_pos =
+      std::max(tensorContext->current_pos, tensorContext->max_pos);
   return buf_size;
 }

@@ -88,7 +92,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
       ret = tensorContext->data.numel();
       break;
     case SEEK_SET:
-      tensorContext->
+      tensorContext->current_pos = offset;
       ret = offset;
       break;
     default:
@@ -101,7 +105,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
 } // namespace

 AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
-    : tensorContext_{data, 0} {
+    : tensorContext_{data, 0, 0} {
   TORCH_CHECK(data.numel() > 0, "data must not be empty");
   TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
   TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
@@ -110,14 +114,17 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
 }

 AVIOToTensorContext::AVIOToTensorContext()
-    : tensorContext_{
+    : tensorContext_{
+          torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}),
+          0,
+          0} {
   createAVIOContext(
       nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
 }

 torch::Tensor AVIOToTensorContext::getOutputTensor() {
   return tensorContext_.data.narrow(
-      /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.
+      /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max_pos);
 }

 } // namespace facebook::torchcodec
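The max_pos bookkeeping added above exists because FFmpeg muxers commonly seek backwards at the end of encoding (for example to patch a header), which rewinds current_pos; sizing the output by the furthest byte ever written, rather than by the final cursor position, keeps getOutputTensor() from truncating the encoded data. Below is a minimal standalone sketch of the same write/seek/size pattern; it uses a plain std::vector sink with illustrative names, not TorchCodec's actual TensorContext.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy in-memory sink: the furthest write position, not the final cursor
// position, determines the output size.
struct MemorySink {
  std::vector<uint8_t> data;
  int64_t current_pos = 0;
  int64_t max_pos = 0;

  void write(const uint8_t* buf, int64_t size) {
    if (current_pos + size > static_cast<int64_t>(data.size())) {
      data.resize(current_pos + size);
    }
    std::memcpy(data.data() + current_pos, buf, size);
    current_pos += size;
    max_pos = std::max(max_pos, current_pos);  // track furthest byte written
  }

  void seekSet(int64_t offset) {
    current_pos = offset;  // a backwards seek does NOT shrink the output
  }

  std::vector<uint8_t> output() const {
    // Equivalent of narrow(0, 0, max_pos): using current_pos here would
    // drop everything written after the last backwards seek.
    return {data.begin(), data.begin() + max_pos};
  }
};

int main() {
  MemorySink sink;
  uint8_t body[64] = {0}, header[4] = {1, 2, 3, 4};
  sink.write(body, sizeof(body));  // write the bulk of the "file"
  sink.seekSet(0);                 // muxer-style rewind to patch the header
  sink.write(header, sizeof(header));
  return sink.output().size() == 64 ? 0 : 1;  // 64 bytes, not 4
}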
torchcodec/_core/BetaCudaDeviceInterface.cpp
CHANGED

@@ -15,7 +15,7 @@
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/NVDECCache.h"

-
+#include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
 #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
 #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
 }

 static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
-  // Check decoder capabilities - same checks as DALI
-  auto caps = CUVIDDECODECAPS{};
-  caps.eCodecType = videoFormat->codec;
-  caps.eChromaFormat = videoFormat->chroma_format;
-  caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
-  CUresult result = cuvidGetDecoderCaps(&caps);
-  TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
-
-  TORCH_CHECK(
-      caps.bIsSupported,
-      "Codec configuration not supported on this GPU. "
-      "Codec: ",
-      static_cast<int>(videoFormat->codec),
-      ", chroma format: ",
-      static_cast<int>(videoFormat->chroma_format),
-      ", bit depth: ",
-      videoFormat->bit_depth_luma_minus8 + 8);
-
-  TORCH_CHECK(
-      videoFormat->coded_width >= caps.nMinWidth &&
-          videoFormat->coded_height >= caps.nMinHeight,
-      "Video is too small in at least one dimension. Provided: ",
-      videoFormat->coded_width,
-      "x",
-      videoFormat->coded_height,
-      " vs supported:",
-      caps.nMinWidth,
-      "x",
-      caps.nMinHeight);
-
-  TORCH_CHECK(
-      videoFormat->coded_width <= caps.nMaxWidth &&
-          videoFormat->coded_height <= caps.nMaxHeight,
-      "Video is too large in at least one dimension. Provided: ",
-      videoFormat->coded_width,
-      "x",
-      videoFormat->coded_height,
-      " vs supported:",
-      caps.nMaxWidth,
-      "x",
-      caps.nMaxHeight);
-
-  // See nMaxMBCount in cuviddec.h
-  constexpr unsigned int macroblockConstant = 256;
-  TORCH_CHECK(
-      videoFormat->coded_width * videoFormat->coded_height /
-              macroblockConstant <=
-          caps.nMaxMBCount,
-      "Video is too large (too many macroblocks). "
-      "Provided (width * height / ",
-      macroblockConstant,
-      "): ",
-      videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
-      " vs supported:",
-      caps.nMaxMBCount);
-
-  // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
-  // sure it's actually supported.
-  TORCH_CHECK(
-      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
-      "NV12 output format is not supported for this configuration. ",
-      "Codec: ",
-      static_cast<int>(videoFormat->codec),
-      ", chroma format: ",
-      static_cast<int>(videoFormat->chroma_format),
-      ", bit depth: ",
-      videoFormat->bit_depth_luma_minus8 + 8);
-
   // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
@@ -129,7 +61,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   // automatically converted to 8bits by NVDEC itself. That is, the raw frames
   // we get back from cuvidMapVideoFrame will already be in 8bit format. We
   // won't need to do the conversion ourselves, so that's a lot easier.
-  // In the
+  // In the ffmpeg CUDA interface, we have to do the 10 -> 8bits conversion
   // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
   // requests 10 or 16bits output formats for >8-bit videos!
   // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.display_area.bottom = videoFormat->display_area.bottom;

   CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoderParams);
+  CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }

-
+std::optional<cudaVideoChromaFormat> validateChromaSupport(
+    const AVPixFmtDescriptor* desc) {
+  // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
+  // otherwise.
+  TORCH_CHECK(desc != nullptr, "desc can't be null");
+
+  if (desc->nb_components == 1) {
+    return cudaVideoChromaFormat_Monochrome;
+  } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+    // Make sure it's YUV: has chroma planes and isn't RGB
+    if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
+      return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
+    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
+      return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
+    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
+      return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
+    }
+  }
+
+  return std::nullopt;
+}
+
+std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
+  // Return the corresponding cudaVideoCodec if supported, std::nullopt
+  // otherwise
+  // Note that we currently return nullopt (and thus fallback to CPU) for some
+  // codecs that are technically supported by NVDEC, see comment below.
   switch (codecId) {
     case AV_CODEC_ID_H264:
       return cudaVideoCodec_H264;
@@ -189,12 +147,72 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
     //   return cudaVideoCodec_JPEG;
     // case AV_CODEC_ID_VC1:
     //   return cudaVideoCodec_VC1;
-    default:
-
-  }
+    default:
+      return std::nullopt;
   }
 }

+bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
+  // Return true iff the input video stream is supported by our NVDEC
+  // implementation.
+
+  auto codecType = validateCodecSupport(codecContext->codec_id);
+  if (!codecType.has_value()) {
+    return false;
+  }
+
+  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
+  if (!desc) {
+    return false;
+  }
+
+  auto chromaFormat = validateChromaSupport(desc);
+  if (!chromaFormat.has_value()) {
+    return false;
+  }
+
+  auto caps = CUVIDDECODECAPS{};
+  caps.eCodecType = codecType.value();
+  caps.eChromaFormat = chromaFormat.value();
+  caps.nBitDepthMinus8 = desc->comp[0].depth - 8;
+
+  CUresult result = cuvidGetDecoderCaps(&caps);
+  if (result != CUDA_SUCCESS) {
+    return false;
+  }
+
+  if (!caps.bIsSupported) {
+    return false;
+  }
+
+  auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
+  auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
+  if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
+      coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
+      coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
+    return false;
+  }
+
+  // See nMaxMBCount in cuviddec.h
+  constexpr unsigned int macroblockConstant = 256;
+  if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
+    return false;
+  }
+
+  // We'll set the decoderParams.OutputFormat to NV12, so we need to make
+  // sure it's actually supported.
+  // TODO: If this fail, we could consider decoding to something else than NV12
+  // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is
+  // what FFmpeg does.
+  bool supportsNV12Output =
+      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
+  if (!supportsNV12Output) {
+    return false;
+  }
+
+  return true;
+}
+
 } // namespace

 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -205,6 +223,8 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)

   initializeCudaContextWithPytorch(device_);
   nppCtx_ = getNppStreamContext(device_);
+
+  nvcuvidAvailable_ = loadNVCUVIDLibrary();
 }

 BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
@@ -216,12 +236,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   // unclear.
   flush();
   unmapPreviousFrame();
-    NVDECCache::getCache(device_.
-
+    NVDECCache::getCache(device_).returnDecoder(
+        &videoFormat_, std::move(decoder_));
   }

   if (videoParser_) {
-    // TODONVDEC P2: consider caching this? Does DALI do that?
     cuvidDestroyVideoParser(videoParser_);
     videoParser_ = nullptr;
   }
@@ -231,7 +250,21 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {

 void BetaCudaDeviceInterface::initialize(
     const AVStream* avStream,
-    const UniqueDecodingAVFormatContext& avFormatCtx
+    const UniqueDecodingAVFormatContext& avFormatCtx,
+    [[maybe_unused]] const SharedAVCodecContext& codecContext) {
+  if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
+    cpuFallback_ = createDeviceInterface(torch::kCPU);
+    TORCH_CHECK(
+        cpuFallback_ != nullptr, "Failed to create CPU device interface");
+    cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
+    cpuFallback_->initializeVideo(
+        VideoStreamOptions(),
+        {},
+        /*resizedOutputDims=*/std::nullopt);
+    // We'll always use the CPU fallback from now on, so we can return early.
+    return;
+  }
+
   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
   frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -243,7 +276,11 @@ void BetaCudaDeviceInterface::initialize(

   // Create parser. Default values that aren't obvious are taken from DALI.
   CUVIDPARSERPARAMS parserParams = {};
-
+  auto codecType = validateCodecSupport(codecPar->codec_id);
+  TORCH_CHECK(
+      codecType.has_value(),
+      "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+  parserParams.CodecType = codecType.value();
   parserParams.ulMaxNumDecodeSurfaces = 8;
   parserParams.ulMaxDisplayDelay = 0;
   // Callback setup, all are triggered by the parser within a call
@@ -362,11 +399,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
   }

   if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_
+    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

     if (!decoder_) {
       // TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI.
+      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+      // be enabled in DALI by default.
       decoder_ = createDecoder(videoFormat);
     }

@@ -382,6 +420,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
 // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
 // the NVCUVID parser.
 int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
+  if (cpuFallback_) {
+    return cpuFallback_->sendPacket(packet);
+  }
+
   TORCH_CHECK(
       packet.get() && packet->data && packet->size > 0,
       "sendPacket received an empty packet, this is unexpected, please report.");
@@ -405,6 +447,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
 }

 int BetaCudaDeviceInterface::sendEOFPacket() {
+  if (cpuFallback_) {
+    return cpuFallback_->sendEOFPacket();
+  }
+
   CUVIDSOURCEDATAPACKET cuvidPacket = {};
   cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
   eofSent_ = true;
@@ -466,6 +512,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(

 // Moral equivalent of avcodec_receive_frame().
 int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+  if (cpuFallback_) {
+    return cpuFallback_->receiveFrame(avFrame);
+  }
+
   if (readyFrames_.empty()) {
     // No frame found, instruct caller to try again later after sending more
     // packets, or to stop if EOF was already sent.
@@ -480,8 +530,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   procParams.top_field_first = dispInfo.top_field_first;
   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
   // We set the NVDEC stream to the current stream. It will be waited upon by
-  // the NPP stream before any color conversion.
-  // is in the default interface.
+  // the NPP stream before any color conversion.
   // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
   // CUstream
   procParams.output_stream = reinterpret_cast<CUstream>(
@@ -601,6 +650,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
 }

 void BetaCudaDeviceInterface::flush() {
+  if (cpuFallback_) {
+    cpuFallback_->flush();
+    return;
+  }
+
   // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
   // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
   // don't say whether this should be an empty packet, or whether it should be a
@@ -618,8 +672,23 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-
-
+  if (cpuFallback_) {
+    // CPU decoded frame - need to do CPU color conversion then transfer to GPU
+    FrameOutput cpuFrameOutput;
+    cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
+
+    // Transfer CPU frame to GPU
+    if (preAllocatedOutputTensor.has_value()) {
+      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
+      frameOutput.data = preAllocatedOutputTensor.value();
+    } else {
+      frameOutput.data = cpuFrameOutput.data.to(device_);
+    }
+    return;
+  }
+
+  // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
+  // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
       "Expected CUDA format frame from BETA CUDA interface");
@@ -633,4 +702,17 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }

+std::string BetaCudaDeviceInterface::getDetails() {
+  std::string details = "Beta CUDA Device Interface.";
+  if (cpuFallback_) {
+    details += " Using CPU fallback.";
+    if (!nvcuvidAvailable_) {
+      details += " NVCUVID not available!";
+    }
+  } else {
+    details += " Using NVDEC.";
+  }
+  return details;
+}
+
 } // namespace facebook::torchcodec
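Taken together, the changes above implement a probe-once, delegate-forever pattern: initialize() decides between native NVDEC and a CPU interface exactly once, and every subsequent entry point (sendPacket, sendEOFPacket, receiveFrame, flush, frame conversion) forwards to cpuFallback_ when it is set. A rough sketch of that dispatch shape, using hypothetical class and method names rather than TorchCodec's real DeviceInterface API:

#include <memory>
#include <string>

// Hypothetical, simplified decoder interface; not TorchCodec's real one.
struct IDecoder {
  virtual ~IDecoder() = default;
  virtual int sendPacket(const void* pkt) = 0;
  virtual int receiveFrame(void* frame) = 0;
  virtual std::string details() const { return "base"; }
};

struct CpuDecoder : IDecoder {
  int sendPacket(const void*) override { return 0; }
  int receiveFrame(void*) override { return 0; }
  std::string details() const override { return "CPU software decode"; }
};

// GPU decoder that probes hardware support once, then either uses the native
// path or forwards every call to a CPU fallback for the stream's lifetime.
class GpuDecoder : public IDecoder {
 public:
  void initialize(bool driverLoaded, bool codecSupported) {
    if (!driverLoaded || !codecSupported) {
      cpuFallback_ = std::make_unique<CpuDecoder>();  // decided once, up front
    }
  }
  int sendPacket(const void* pkt) override {
    if (cpuFallback_) {
      return cpuFallback_->sendPacket(pkt);  // delegate, never mix paths
    }
    return sendToHardwareParser(pkt);
  }
  int receiveFrame(void* frame) override {
    if (cpuFallback_) {
      return cpuFallback_->receiveFrame(frame);
    }
    return popDecodedSurface(frame);
  }
  std::string details() const override {
    return cpuFallback_ ? "GPU interface, CPU fallback"
                        : "GPU interface, native decode";
  }

 private:
  int sendToHardwareParser(const void*) { return 0; }  // stand-in for NVDEC calls
  int popDecodedSurface(void*) { return 0; }
  std::unique_ptr<IDecoder> cpuFallback_;
};

int main() {
  GpuDecoder dec;
  dec.initialize(/*driverLoaded=*/false, /*codecSupported=*/true);
  return dec.details() == "GPU interface, CPU fallback" ? 0 : 1;
}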
torchcodec/_core/BetaCudaDeviceInterface.h
CHANGED

@@ -40,7 +40,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

   void initialize(
       const AVStream* avStream,
-      const UniqueDecodingAVFormatContext& avFormatCtx
+      const UniqueDecodingAVFormatContext& avFormatCtx,
+      const SharedAVCodecContext& codecContext) override;

   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
@@ -48,10 +49,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       std::optional<torch::Tensor> preAllocatedOutputTensor =
           std::nullopt) override;

-  bool canDecodePacketDirectly() const override {
-    return true;
-  }
-
   int sendPacket(ReferenceAVPacket& packet) override;
   int sendEOFPacket() override;
   int receiveFrame(UniqueAVFrame& avFrame) override;
@@ -62,6 +59,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
   int frameReadyInDisplayOrder(CUVIDPARSERDISPINFO* dispInfo);

+  std::string getDetails() override;
+
  private:
   int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);

@@ -97,6 +96,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {

   // NPP context for color conversion
   UniqueNppContext nppCtx_;
+
+  std::unique_ptr<DeviceInterface> cpuFallback_;
+  bool nvcuvidAvailable_ = false;
 };

 } // namespace facebook::torchcodec
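The nvcuvidAvailable_ flag declared above is set from loadNVCUVIDLibrary(), and the CMakeLists.txt change below drops the link-time dependency on libnvcuvid entirely; the new NVCUVIDRuntimeLoader.cpp (its body is not shown in this diff view) presumably resolves the driver's cuvid* entry points at runtime, so a missing driver degrades to the CPU fallback instead of a load-time failure. A generic sketch of that style of runtime loading, using only dlopen/LoadLibrary; the library names and the single probed symbol are illustrative, and this is not the actual NVCUVIDRuntimeLoader implementation:

#include <cstdio>

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

// Returns an opaque handle to the NVCUVID driver library, or nullptr if the
// driver is not installed. Library names are the conventional ones.
static void* openNvcuvid() {
#ifdef _WIN32
  return reinterpret_cast<void*>(LoadLibraryA("nvcuvid.dll"));
#else
  void* handle = dlopen("libnvcuvid.so.1", RTLD_NOW | RTLD_GLOBAL);
  if (!handle) {
    handle = dlopen("libnvcuvid.so", RTLD_NOW | RTLD_GLOBAL);
  }
  return handle;
#endif
}

// Resolves one symbol from the handle; a real loader would do this for every
// cuvid* function it calls and keep the pointers in a dispatch table.
static void* resolveSymbol(void* handle, const char* name) {
#ifdef _WIN32
  return reinterpret_cast<void*>(
      GetProcAddress(reinterpret_cast<HMODULE>(handle), name));
#else
  return dlsym(handle, name);
#endif
}

bool loadNvcuvidOrFallback() {
  void* handle = openNvcuvid();
  if (!handle) {
    std::fprintf(stderr, "nvcuvid not found, falling back to CPU decode\n");
    return false;
  }
  // Example probe: the decoder-creation entry point must exist.
  return resolveSymbol(handle, "cuvidCreateDecoder") != nullptr;
}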
torchcodec/_core/CMakeLists.txt
CHANGED

@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
   )

   if(ENABLE_CUDA)
-    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp)
+    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
   endif()

   set(core_library_dependencies
@@ -108,27 +108,9 @@ function(make_torchcodec_libraries
   )

   if(ENABLE_CUDA)
-    # Try to find NVCUVID. Try the normal way first. This should work locally.
-    find_library(NVCUVID_LIBRARY NAMES nvcuvid)
-    # If not found, try with version suffix, or hardcoded path. Appears
-    # to be necessary on the CI.
-    if(NOT NVCUVID_LIBRARY)
-      find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
-    endif()
-    if(NOT NVCUVID_LIBRARY)
-      set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
-    endif()
-
-    if(NVCUVID_LIBRARY)
-      message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
-    else()
-      message(FATAL_ERROR "Could not find NVCUVID library")
-    endif()
-
     list(APPEND core_library_dependencies
         ${CUDA_nppi_LIBRARY}
         ${CUDA_nppicc_LIBRARY}
-        ${NVCUVID_LIBRARY}
     )
   endif()

torchcodec/_core/CUDACommon.cpp
CHANGED

@@ -5,14 +5,12 @@
 // LICENSE file in the root directory of this source tree.

 #include "src/torchcodec/_core/CUDACommon.h"
+#include "src/torchcodec/_core/Cache.h" // for PerGpuCache

 namespace facebook::torchcodec {

 namespace {

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
 // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
 // Set to a positive number to have a cache of that size.
 const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
 }

 UniqueNppContext getNppStreamContext(const torch::Device& device) {
-
+  int deviceIndex = getDeviceIndex(device);

   UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
   if (nppCtx) {
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

   nppCtx = std::make_unique<NppStreamContext>();
   cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop,
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
   TORCH_CHECK(
       err == cudaSuccess,
       "cudaGetDeviceProperties failed: ",
       cudaGetErrorString(err));

-  nppCtx->nCudaDeviceId =
+  nppCtx->nCudaDeviceId = deviceIndex;
   nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
   nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
   nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
   }
 }

+int getDeviceIndex(const torch::Device& device) {
+  // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+  // libraries use int. So we use int, too.
+  int deviceIndex = static_cast<int>(device.index());
+  TORCH_CHECK(
+      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+
+  if (deviceIndex == -1) {
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  return deviceIndex;
+}
+
 } // namespace facebook::torchcodec
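The new getDeviceIndex() helper normalizes one awkward case: a torch::Device constructed as plain "cuda" reports index -1, which the helper resolves to the currently active device via cudaGetDevice() before the index reaches NPP and cudaGetDeviceProperties(). A small usage sketch of that resolution, assuming a CUDA-enabled libtorch build; the helper below restates the logic locally and omits the MAX_CUDA_GPUS bounds check:

#include <cuda_runtime.h>
#include <torch/types.h>

#include <cstdio>

// Local re-statement of the resolution logic, for illustration only.
static int resolveDeviceIndex(const torch::Device& device) {
  int index = static_cast<int>(device.index());
  if (index == -1) {
    // "cuda" with no index means "whatever device is current on this thread".
    if (cudaGetDevice(&index) != cudaSuccess) {
      return -1;  // no usable CUDA device
    }
  }
  return index;
}

int main() {
  torch::Device implicit(torch::kCUDA);        // index() == -1 (unspecified)
  torch::Device explicitDev(torch::kCUDA, 0);  // index() == 0

  std::printf("implicit -> %d\n", resolveDeviceIndex(implicit));
  std::printf("explicit -> %d\n", resolveDeviceIndex(explicitDev));
  return 0;
}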
torchcodec/_core/CUDACommon.h
CHANGED

@@ -11,7 +11,6 @@
 #include <npp.h>
 #include <torch/types.h>

-#include "src/torchcodec/_core/Cache.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

 namespace facebook::torchcodec {

+// Pytorch can only handle up to 128 GPUs.
+// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+constexpr int MAX_CUDA_GPUS = 128;
+
 void initializeCudaContextWithPytorch(const torch::Device& device);

 // Unique pointer type for NPP stream context
@@ -43,4 +46,6 @@ void validatePreAllocatedTensorShape(
     const std::optional<torch::Tensor>& preAllocatedOutputTensor,
     const UniqueAVFrame& avFrame);

+int getDeviceIndex(const torch::Device& device);
+
 } // namespace facebook::torchcodec