torchcodec 0.7.0__cp312-cp312-win_amd64.whl → 0.8.1__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchcodec might be problematic.
- torchcodec/_core/AVIOTensorContext.cpp +23 -16
- torchcodec/_core/AVIOTensorContext.h +2 -1
- torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
- torchcodec/_core/CMakeLists.txt +18 -3
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +6 -20
- torchcodec/_core/CpuDeviceInterface.cpp +195 -108
- torchcodec/_core/CpuDeviceInterface.h +84 -19
- torchcodec/_core/CudaDeviceInterface.cpp +227 -376
- torchcodec/_core/CudaDeviceInterface.h +38 -6
- torchcodec/_core/DeviceInterface.cpp +57 -19
- torchcodec/_core/DeviceInterface.h +97 -16
- torchcodec/_core/Encoder.cpp +346 -9
- torchcodec/_core/Encoder.h +62 -1
- torchcodec/_core/FFMPEGCommon.cpp +190 -3
- torchcodec/_core/FFMPEGCommon.h +27 -1
- torchcodec/_core/FilterGraph.cpp +30 -22
- torchcodec/_core/FilterGraph.h +15 -1
- torchcodec/_core/Frame.cpp +22 -7
- torchcodec/_core/Frame.h +15 -61
- torchcodec/_core/Metadata.h +2 -2
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +196 -201
- torchcodec/_core/SingleStreamDecoder.h +42 -15
- torchcodec/_core/StreamOptions.h +16 -6
- torchcodec/_core/Transform.cpp +87 -0
- torchcodec/_core/Transform.h +84 -0
- torchcodec/_core/__init__.py +4 -0
- torchcodec/_core/custom_ops.cpp +257 -32
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +147 -44
- torchcodec/_core/pybind_ops.cpp +22 -59
- torchcodec/_samplers/video_clip_sampler.py +7 -19
- torchcodec/decoders/__init__.py +1 -0
- torchcodec/decoders/_decoder_utils.py +61 -1
- torchcodec/decoders/_video_decoder.py +46 -20
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/samplers/_time_based.py +8 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
- torchcodec-0.8.1.dist-info/RECORD +82 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
- torchcodec-0.7.0.dist-info/RECORD +0 -67
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/CudaDeviceInterface.cpp

@@ -1,6 +1,5 @@
 #include <ATen/cuda/CUDAEvent.h>
 #include <c10/cuda/CUDAStream.h>
-#include <npp.h>
 #include <torch/types.h>
 #include <mutex>
 
@@ -16,18 +15,12 @@ extern "C" {
 namespace facebook::torchcodec {
 namespace {
 
-static bool g_cuda =
-    registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
+static bool g_cuda = registerDeviceInterface(
+    DeviceInterfaceKey(torch::kCUDA),
+    [](const torch::Device& device) {
       return new CudaDeviceInterface(device);
     });
 
-// BT.709 full range color conversion matrix for YUV to RGB conversion.
-// See Note [YUV -> RGB Color Conversion, color space and color range] below.
-constexpr Npp32f bt709FullRangeColorTwist[3][4] = {
-    {1.0f, 0.0f, 1.5748f, 0.0f},
-    {1.0f, -0.187324273f, -0.468124273f, -128.0f},
-    {1.0f, 1.8556f, 0.0f, -128.0f}};
-
 // We reuse cuda contexts across VideoDeoder instances. This is because
 // creating a cuda context is expensive. The cache mechanism is as follows:
 // 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
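Note on the hunk above: `static bool g_cuda = registerDeviceInterface(...)` relies on the C++ static-initialization idiom to register the CUDA interface before `main()` runs; the 0.8.1 change only swaps the registration key to `DeviceInterfaceKey`. A minimal self-contained sketch of that self-registration pattern, with a hypothetical registry that is not TorchCodec's actual API:

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct DeviceInterface {
  virtual ~DeviceInterface() = default;
};
using Factory = std::function<DeviceInterface*()>;

// Meyers-singleton registry: constructed on first use, so it is safe to call
// from any static initializer regardless of translation-unit order.
std::map<std::string, Factory>& registry() {
  static std::map<std::string, Factory> r;
  return r;
}

bool registerDeviceInterface(const std::string& key, Factory factory) {
  registry().emplace(key, std::move(factory));
  return true;
}

struct FakeCudaInterface : DeviceInterface {};

// Runs before main(): the only purpose of g_cuda is this side effect, which
// is why the real constructor later re-checks that g_cuda is set.
static bool g_cuda =
    registerDeviceInterface("cuda", [] { return new FakeCudaInterface(); });

int main() {
  std::cout << "cuda registered: " << registry().count("cuda") << "\n"; // 1
  DeviceInterface* iface = registry().at("cuda")();
  delete iface;
}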
@@ -39,39 +32,48 @@ constexpr Npp32f bt709FullRangeColorTwist[3][4] = {
 // from
 // the cache. If the cache is empty we create a new cuda context.
 
-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
 // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
 // Set to a positive number to have a cache of that size.
 const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
 PerGpuCache<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>
     g_cached_hw_device_ctxs(MAX_CUDA_GPUS, MAX_CONTEXTS_PER_GPU_IN_CACHE);
-PerGpuCache<NppStreamContext> g_cached_npp_ctxs(
-    MAX_CUDA_GPUS,
-    MAX_CONTEXTS_PER_GPU_IN_CACHE);
 
+int getFlagsAVHardwareDeviceContextCreate() {
+  // 58.26.100 introduced the concept of reusing the existing cuda context
+  // which is much faster and lower memory than creating a new cuda context.
 #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
+  return AV_CUDA_USE_CURRENT_CONTEXT;
+#else
+  return 0;
+#endif
+}
+
+UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
+  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
+  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
+  int deviceIndex = getDeviceIndex(device);
 
-AVBufferRef* getFFMPEGContextFromExistingCudaContext(
-    const torch::Device& device,
-    torch::DeviceIndex nonNegativeDeviceIndex,
-    enum AVHWDeviceType type) {
+  UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
+  if (hardwareDeviceCtx) {
+    return hardwareDeviceCtx;
+  }
+
+  // Create hardware device context
   c10::cuda::CUDAGuard deviceGuard(device);
-  // Valid values for the argument to cudaSetDevice are 0 to maxDevices - 1:
-  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb
-  // So we ensure the deviceIndex is not negative.
   // We set the device because we may be called from a different thread than
   // the one that initialized the cuda context.
-  cudaSetDevice(nonNegativeDeviceIndex);
-  AVBufferRef* hw_device_ctx = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
+  TORCH_CHECK(
+      cudaSetDevice(deviceIndex) == cudaSuccess, "Failed to set CUDA device");
+  AVBufferRef* hardwareDeviceCtxRaw = nullptr;
+  std::string deviceOrdinal = std::to_string(deviceIndex);
+
   int err = av_hwdevice_ctx_create(
-      &hw_device_ctx,
+      &hardwareDeviceCtxRaw,
       type,
       deviceOrdinal.c_str(),
       nullptr,
-      AV_CUDA_USE_CURRENT_CONTEXT);
+      getFlagsAVHardwareDeviceContextCreate());
+
   if (err < 0) {
     /* clang-format off */
     TORCH_CHECK(
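For context on `getHardwareDeviceContext()` above: stripped of the caching and the Torch checks, it boils down to a single `av_hwdevice_ctx_create()` call. A standalone sketch using only public FFmpeg API (error handling simplified; per the hunk above, `AV_CUDA_USE_CURRENT_CONTEXT` would be passed as the flags argument on libavutil >= 58.26.100):

extern "C" {
#include <libavutil/hwcontext.h>
}
#include <cstdio>

int main() {
  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
  if (type == AV_HWDEVICE_TYPE_NONE) {
    std::fprintf(stderr, "this FFmpeg build has no CUDA support\n");
    return 1;
  }
  AVBufferRef* hwDeviceCtx = nullptr;
  // "0" is the GPU ordinal, passed as a string; flags = 0 creates a fresh
  // cuda context (AV_CUDA_USE_CURRENT_CONTEXT would reuse the current one).
  int err = av_hwdevice_ctx_create(&hwDeviceCtx, type, "0", nullptr, 0);
  if (err < 0) {
    std::fprintf(stderr, "av_hwdevice_ctx_create failed: %d\n", err);
    return 1;
  }
  av_buffer_unref(&hwDeviceCtx); // drop our reference when done
  return 0;
}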
@@ -82,288 +84,249 @@ AVBufferRef* getFFMPEGContextFromExistingCudaContext(
         "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
     /* clang-format on */
   }
-  return hw_device_ctx;
-}
 
-
-
-AVBufferRef* getFFMPEGContextFromNewCudaContext(
-    [[maybe_unused]] const torch::Device& device,
-    torch::DeviceIndex nonNegativeDeviceIndex,
-    enum AVHWDeviceType type) {
-  AVBufferRef* hw_device_ctx = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
-  int err = av_hwdevice_ctx_create(
-      &hw_device_ctx, type, deviceOrdinal.c_str(), nullptr, 0);
-  if (err < 0) {
-    TORCH_CHECK(
-        false,
-        "Failed to create specified HW device",
-        getFFMPEGErrorStringFromErrorCode(err));
-  }
-  return hw_device_ctx;
+  return UniqueAVBufferRef(hardwareDeviceCtxRaw);
 }
 
-
+} // namespace
 
-
-
-  TORCH_CHECK(
-
+CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
+    : DeviceInterface(device) {
+  TORCH_CHECK(g_cuda, "CudaDeviceInterface was not registered!");
+  TORCH_CHECK(
+      device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
 
-  UniqueAVBufferRef hw_device_ctx = g_cached_hw_device_ctxs.get(device);
-  if (hw_device_ctx) {
-    return hw_device_ctx;
-  }
+  initializeCudaContextWithPytorch(device_);
 
-  // 58.26.100 introduced the concept of reusing the existing cuda context
-  // which is much faster and lower memory than creating a new cuda context.
-  // So we try to use that if it is available.
-  // FFMPEG 6.1.2 appears to be the earliest release that contains version
-  // 58.26.100 of avutil.
-  // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
-  return UniqueAVBufferRef(getFFMPEGContextFromExistingCudaContext(
-      device, nonNegativeDeviceIndex, type));
-#else
-  return UniqueAVBufferRef(
-      getFFMPEGContextFromNewCudaContext(device, nonNegativeDeviceIndex, type));
-#endif
+  hardwareDeviceCtx_ = getHardwareDeviceContext(device_);
+  nppCtx_ = getNppStreamContext(device_);
 }
 
-
-
-
-
-  std::unique_ptr<NppStreamContext> nppCtx = g_cached_npp_ctxs.get(device);
-  if (nppCtx) {
-    return nppCtx;
+CudaDeviceInterface::~CudaDeviceInterface() {
+  if (hardwareDeviceCtx_) {
+    g_cached_hw_device_ctxs.addIfCacheHasCapacity(
+        device_, std::move(hardwareDeviceCtx_));
   }
+  returnNppStreamContextToCache(device_, std::move(nppCtx_));
+}
+
+void CudaDeviceInterface::initialize(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx,
+    const SharedAVCodecContext& codecContext) {
+  TORCH_CHECK(avStream != nullptr, "avStream is null");
+  codecContext_ = codecContext;
+  timeBase_ = avStream->time_base;
 
-  //
-
-  // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
-  // And the nppGetStreamContext() helper is deprecated. We are explicitly
-  // supposed to create the NppStreamContext manually from the CUDA device
-  // properties:
-  // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
-
-  nppCtx = std::make_unique<NppStreamContext>();
-  cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
+  // TODO: Ideally, we should keep all interface implementations independent.
+  cpuInterface_ = createDeviceInterface(torch::kCPU);
   TORCH_CHECK(
-
-
-
-
-
-
-  nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
-  nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
-  nppCtx->nSharedMemPerBlock = prop.sharedMemPerBlock;
-  nppCtx->nCudaDevAttrComputeCapabilityMajor = prop.major;
-  nppCtx->nCudaDevAttrComputeCapabilityMinor = prop.minor;
-
-  return nppCtx;
+      cpuInterface_ != nullptr, "Failed to create CPU device interface");
+  cpuInterface_->initialize(avStream, avFormatCtx, codecContext);
+  cpuInterface_->initializeVideo(
+      VideoStreamOptions(),
+      {},
+      /*resizedOutputDims=*/std::nullopt);
 }
 
-
+void CudaDeviceInterface::initializeVideo(
+    const VideoStreamOptions& videoStreamOptions,
+    [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
+    [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {
+  videoStreamOptions_ = videoStreamOptions;
+}
 
-CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
-    : DeviceInterface(device) {
-  TORCH_CHECK(g_cuda, "CudaDeviceInterface was not registered!");
+void CudaDeviceInterface::registerHardwareDeviceWithCodec(
+    AVCodecContext* codecContext) {
   TORCH_CHECK(
-      device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
+      hardwareDeviceCtx_, "Hardware device context has not been initialized");
+  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
+  codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
 }
 
-CudaDeviceInterface
-
-
+UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
+    UniqueAVFrame& avFrame) {
+  // We need FFmpeg filters to handle those conversion cases which are not
+  // directly implemented in CUDA or CPU device interface (in case of a
+  // fallback).
+
+  // Input frame is on CPU, we will just pass it to CPU device interface, so
+  // skipping filters context as CPU device interface will handle everything for
+  // us.
+  if (avFrame->format != AV_PIX_FMT_CUDA) {
+    return std::move(avFrame);
   }
-
-
+
+  auto hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+  TORCH_CHECK(
+      hwFramesCtx != nullptr,
+      "The AVFrame does not have a hw_frames_ctx. "
+      "That's unexpected, please report this to the TorchCodec repo.");
+
+  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+
+  // If the frame is already in NV12 format, we don't need to do anything.
+  if (actualFormat == AV_PIX_FMT_NV12) {
+    return std::move(avFrame);
   }
-}
 
-
-
+  AVPixelFormat outputFormat;
+  std::stringstream filters;
 
-
-
-
-
-
-
-
-
-
+  unsigned version_int = avfilter_version();
+  if (version_int < AV_VERSION_INT(8, 0, 103)) {
+    // Color conversion support ('format=' option) was added to scale_cuda from
+    // n5.0. With the earlier version of ffmpeg we have no choice but use CPU
+    // filters. See:
+    // https://github.com/FFmpeg/FFmpeg/commit/62dc5df941f5e196164c151691e4274195523e95
+    outputFormat = AV_PIX_FMT_RGB24;
+
+    auto actualFormatName = av_get_pix_fmt_name(actualFormat);
+    TORCH_CHECK(
+        actualFormatName != nullptr,
+        "The actual format of a frame is unknown to FFmpeg. "
+        "That's unexpected, please report this to the TorchCodec repo.");
+
+    filters << "hwdownload,format=" << actualFormatName;
+  } else {
+    // Actual output color format will be set via filter options
+    outputFormat = AV_PIX_FMT_CUDA;
+
+    filters << "scale_cuda=format=nv12:interp_algo=bilinear";
+  }
+
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+
+  auto newContext = std::make_unique<FiltersContext>(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      avFrame->sample_aspect_ratio,
+      avFrame->width,
+      avFrame->height,
+      outputFormat,
+      filters.str(),
+      timeBase_,
+      av_buffer_ref(avFrame->hw_frames_ctx));
+
+  if (!nv12Conversion_ || *nv12ConversionContext_ != *newContext) {
+    nv12Conversion_ =
+        std::make_unique<FilterGraph>(*newContext, videoStreamOptions_);
+    nv12ConversionContext_ = std::move(newContext);
+  }
+  auto filteredAVFrame = nv12Conversion_->convert(avFrame);
+
+  // If this check fails it means the frame wasn't
+  // reshaped to its expected dimensions by filtergraph.
+  TORCH_CHECK(
+      (filteredAVFrame->width == nv12ConversionContext_->outputWidth) &&
+          (filteredAVFrame->height == nv12ConversionContext_->outputHeight),
+      "Expected frame from filter graph of ",
+      nv12ConversionContext_->outputWidth,
+      "x",
+      nv12ConversionContext_->outputHeight,
+      ", got ",
+      filteredAVFrame->width,
+      "x",
+      filteredAVFrame->height);
+
+  return filteredAVFrame;
 }
 
 void CudaDeviceInterface::convertAVFrameToFrameOutput(
-    const VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
+
+  // All of our CUDA decoding assumes NV12 format. We handle non-NV12 formats by
+  // converting them to NV12.
+  avFrame = maybeConvertAVFrameToNV12OrRGB24(avFrame);
+
   if (avFrame->format != AV_PIX_FMT_CUDA) {
     // The frame's format is AV_PIX_FMT_CUDA if and only if its content is on
-    // the GPU. In this branch, the frame is on the CPU
-    //
-    //
-    //
-    //
-    //
-    //
-
-
+    // the GPU. In this branch, the frame is on the CPU. There are two possible
+    // reasons:
+    //
+    // 1. During maybeConvertAVFrameToNV12OrRGB24(), we had a non-NV12 format
+    //    frame and we're on FFmpeg 4.4 or earlier. In such cases, we had to
+    //    use CPU filters and we just converted the frame to RGB24.
+    // 2. This is what NVDEC gave us if it wasn't able to decode a frame, for
+    //    whatever reason. Typically that happens if the video's encoder isn't
+    //    supported by NVDEC.
+    //
+    // In both cases, we have a frame on the CPU. We send the frame back to the
+    // CUDA device when we're done.
+
+    enum AVPixelFormat frameFormat =
+        static_cast<enum AVPixelFormat>(avFrame->format);
 
     FrameOutput cpuFrameOutput;
-
-
-
-
-
-
-
-
+    if (frameFormat == AV_PIX_FMT_RGB24) {
+      // Reason 1 above. The frame is already in RGB24, we just need to convert
+      // it to a tensor.
+      cpuFrameOutput.data = rgbAVFrameToTensor(avFrame);
+    } else {
+      // Reason 2 above. We need to do a full conversion which requires an
+      // actual CPU device.
+      cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
+    }
+
+    // Finally, we need to send the frame back to the GPU. Note that the
+    // pre-allocated tensor is on the GPU, so we can't send that to the CPU
+    // device interface. We copy it over here.
+    if (preAllocatedOutputTensor.has_value()) {
+      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
+      frameOutput.data = preAllocatedOutputTensor.value();
+    } else {
+      frameOutput.data = cpuFrameOutput.data.to(device_);
+    }
+
+    usingCPUFallback_ = true;
     return;
   }
 
+  usingCPUFallback_ = false;
+
   // Above we checked that the AVFrame was on GPU, but that's not enough, we
   // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
-  // because this is what the NPP color conversion routines expect.
-  //
-  // non-8bit videos. This is supported on CPU.
+  // because this is what the NPP color conversion routines expect. This SHOULD
+  // be enforced by our call to maybeConvertAVFrameToNV12OrRGB24() above.
   TORCH_CHECK(
       avFrame->hw_frames_ctx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
-
-  auto hwFramesCtx =
+      "The AVFrame does not have a hw_frames_ctx. This should never happen");
+  AVHWFramesContext* hwFramesCtx =
       reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+  TORCH_CHECK(
+      hwFramesCtx != nullptr,
+      "The AVFrame does not have a valid hw_frames_ctx. This should never happen");
+
   AVPixelFormat actualFormat = hwFramesCtx->sw_format;
   TORCH_CHECK(
       actualFormat == AV_PIX_FMT_NV12,
       "The AVFrame is ",
       (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
                                          : "unknown"),
-      ", but we expected AV_PIX_FMT_NV12. "
-
-      "Try using the CPU device instead. "
-      "If the video is 10bit, we are tracking 10bit support in "
-      "https://github.com/pytorch/torchcodec/issues/776");
-
-  auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
-  int height = frameDims.height;
-  int width = frameDims.width;
-  torch::Tensor& dst = frameOutput.data;
-  if (preAllocatedOutputTensor.has_value()) {
-    dst = preAllocatedOutputTensor.value();
-    auto shape = dst.sizes();
-    TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
-            (shape[2] == 3),
-        "Expected tensor of shape ",
-        height,
-        "x",
-        width,
-        "x3, got ",
-        shape);
-  } else {
-    dst = allocateEmptyHWCTensor(height, width, device_);
-  }
-
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
+      ", but we expected AV_PIX_FMT_NV12. "
+      "That's unexpected, please report this to the TorchCodec repo.");
 
-  //
-  //
-  //
-  // functions, to ensure NVDEC has finished decoding the frame before running
-  // the NPP color-conversion.
-  // Note that our code is generic and assumes that the NVDEC's stream can be
-  // arbitrary, but unfortunately we know it's hardcoded to be the default
-  // stream by FFmpeg:
+  // Figure out the NVDEC stream from the avFrame's hardware context.
+  // In reality, we know that this stream is hardcoded to be the default stream
+  // by FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
   TORCH_CHECK(
      hwFramesCtx->device_ctx != nullptr,
      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
   auto cudaDeviceCtx =
       static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
-  at::cuda::CUDAEvent nvdecDoneEvent;
+  TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
   at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
-      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
-  nvdecDoneEvent.record(nvdecStream);
+      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, device_.index());
 
-
-  at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
-  nvdecDoneEvent.block(nppStream);
-
-  // Create the NPP context if we haven't yet.
-  nppCtx_->hStream = nppStream.stream();
-  cudaError_t err =
-      cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
-  TORCH_CHECK(
-      err == cudaSuccess,
-      "cudaStreamGetFlags failed: ",
-      cudaGetErrorString(err));
-
-  NppiSize oSizeROI = {width, height};
-  Npp8u* yuvData[2] = {avFrame->data[0], avFrame->data[1]};
-
-  NppStatus status;
-
-  // For background, see
-  // Note [YUV -> RGB Color Conversion, color space and color range]
-  if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
-      // NPP provides a pre-defined color conversion function for BT.709 full
-      // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it's not closely
-      // matching the results we have on CPU. So we're using a custom color
-      // conversion matrix, which provides more accurate results. See the note
-      // mentioned above for details, and headaches.
-
-      int srcStep[2] = {avFrame->linesize[0], avFrame->linesize[1]};
-
-      status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(
-          yuvData,
-          srcStep,
-          static_cast<Npp8u*>(dst.data_ptr()),
-          dst.stride(0),
-          oSizeROI,
-          bt709FullRangeColorTwist,
-          *nppCtx_);
-    } else {
-      // If not full range, we assume studio limited range.
-      // The color conversion matrix for BT.709 limited range should be:
-      // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
-      //   {1.16438356f, 0.0f, 1.79274107f, -16.0f},
-      //   {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
-      //   {1.16438356f, 2.11240179f, 0.0f, -128.0f}
-      // };
-      // We get very close results to CPU with that, but using the pre-defined
-      // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
-      status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
-          yuvData,
-          avFrame->linesize[0],
-          static_cast<Npp8u*>(dst.data_ptr()),
-          dst.stride(0),
-          oSizeROI,
-          *nppCtx_);
-    }
-  } else {
-    // TODO we're assuming BT.601 color space (and probably limited range) by
-    // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
-    // and other color-spaces like 2020.
-    status = nppiNV12ToRGB_8u_P2C3R_Ctx(
-        yuvData,
-        avFrame->linesize[0],
-        static_cast<Npp8u*>(dst.data_ptr()),
-        dst.stride(0),
-        oSizeROI,
-        *nppCtx_);
-  }
-  TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
+  frameOutput.data = convertNV12FrameToRGB(
+      avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
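A note on the CPU-fallback branch added above: the frame decoded on the CPU is copied into the caller's pre-allocated GPU tensor when one exists, and otherwise moved over with `Tensor::to()`. A standalone libtorch sketch of that hand-off (hypothetical helper, not TorchCodec's actual signature):

#include <torch/torch.h>
#include <optional>

torch::Tensor sendFrameToDevice(
    const torch::Tensor& cpuFrame, // HWC uint8 frame decoded on the CPU
    std::optional<torch::Tensor> preAllocated, // may already live on the GPU
    const torch::Device& device) {
  if (preAllocated.has_value()) {
    // copy_() performs the host-to-device copy; shapes must already match.
    preAllocated->copy_(cpuFrame);
    return *preAllocated;
  }
  // Otherwise allocate on the target device and copy in one step.
  return cpuFrame.to(device);
}

int main() {
  torch::Tensor frame = torch::zeros({720, 1280, 3}, torch::kUInt8);
  // CPU-to-CPU works the same way, so the sketch runs without a GPU:
  torch::Tensor out = sendFrameToDevice(frame, std::nullopt, torch::kCPU);
  return out.device().is_cuda() ? 1 : 0;
}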
@@ -391,124 +354,12 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
   return std::nullopt;
 }
 
-} // namespace facebook::torchcodec
+std::string CudaDeviceInterface::getDetails() {
+  // Note: for this interface specifically the fallback is only known after a
+  // frame has been decoded, not before: that's when FFmpeg decides to fallback,
+  // so we can't know earlier.
+  return std::string("FFmpeg CUDA Device Interface. Using ") +
+      (usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
+}
 
-/* clang-format off */
-// Note: [YUV -> RGB Color Conversion, color space and color range]
-//
-// The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
-// format. We need to convert them to RGB. This note attempts to describe this
-// process. There may be some inaccuracies and approximations that experts will
-// notice, but our goal is only to provide a good enough understanding of the
-// process for torchcodec developers to implement and maintain it.
-// On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
-// to do a lot of the heavy lifting ourselves.
-//
-// Color space and color range
-// ---------------------------
-// Two main characteristics of a frame will affect the conversion process:
-// 1. Color space: This basically defines what YUV values correspond to which
-//    physical wavelength. No need to go into details here,the point is that
-//    videos can come in different color spaces, the most common ones being
-//    BT.601 and BT.709, but there are others.
-//    In FFmpeg this is represented with AVColorSpace:
-//    https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
-// 2. Color range: This defines the range of YUV values. There is:
-//    - full range, also called PC range: AVCOL_RANGE_JPEG
-//    - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
-//    https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
-//
-// Color space and color range are independent concepts, so we can have a BT.709
-// with full range, and another one with limited range. Same for BT.601.
-//
-// In the first version of this note we'll focus on the full color range. It
-// will later be updated to account for the limited range.
-//
-// Color conversion matrix
-// -----------------------
-// YUV -> RGB conversion is defined as the reverse process of the RGB -> YUV,
-// So this is where we'll start.
-// At the core of a RGB -> YUV conversion are the "luma coefficients", which are
-// specific to a given color space and defined by the color space standard. In
-// FFmpeg they can be found here:
-// https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
-//
-// For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
-// Coefficients must sum to 1.
-//
-// Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
-// (that's mathematically, in practice they are represented in integer range).
-// The conversion is defined as:
-// https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
-// Y = kr*R + kg*G + kb*B
-// U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
-// V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
-//
-// Putting all this into matrix form, we get:
-//   [Y]   [kr              kg            kb            ] [R]
-//   [U] = [-kr/u_scale     -kg/u_scale   (1-kb)/u_scale] [G]
-//   [V]   [(1-kr)/v_scale  -kg/v_scale   -kb/v_scale   ] [B]
-//
-//
-// Now, to convert YUV to RGB, we just need to invert this matrix:
-// ```py
-// import torch
-// kr, kg, kb = 0.2126, 0.7152, 0.0722  # BT.709 luma coefficients
-// u_scale = 2 * (1 - kb)
-// v_scale = 2 * (1 - kr)
-//
-// rgb_to_yuv = torch.tensor([
-//     [kr, kg, kb],
-//     [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
-//     [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
-// ])
-//
-// yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
-// print("YUV->RGB matrix (Full Range):")
-// print(yuv_to_rgb_full)
-// ```
-// And we get:
-// tensor([[ 1.0000e+00, -3.3142e-09,  1.5748e+00],
-//         [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
-//         [ 1.0000e+00,  1.8556e+00,  4.6231e-09]])
-//
-// Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
-//
-// Color conversion in NPP
-// -----------------------
-// https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
-//
-// NPP provides different ways to convert YUV to RGB:
-// - pre-defined color conversion functions like
-//   nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx
-//   which are for BT.709 limited and full range, respectively.
-// - generic color conversion functions that accept a custom color conversion
-//   matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx
-//
-// We use the pre-defined functions or the color twist functions depending on
-// which one we find to be closer to the CPU results.
-//
-// The color twist functionality is *partially* described in a section named
-// "YUVToRGBColorTwist". Importantly:
-//
-// - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
-//   and the color-conversion matrix as input. The function itself and the
-//   matrix assume different ranges for YUV values:
-// - The **matrix coefficient** must assume that Y is in [0, 1] and U,V are in
-//   [-0.5, 0.5]. That's how we defined our matrix above.
-// - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects all
-//   of the input Y, U, V to be in [0, 255]. That's how the data comes out of
-//   the decoder.
-// - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
-//   be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
-//   offset to U and V. Y doesn't need to be offset. The offset can be applied
-//   by adding a 4th column to the matrix.
-//
-//
-// So our conversion matrix becomes the following, with new offset column:
-// tensor([[ 1.0000e+00, -3.3142e-09,  1.5748e+00,  0]
-//         [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128]
-//         [ 1.0000e+00,  1.8556e+00,  4.6231e-09, -128]])
-//
-// And that's what we need to pass for BT701, full range.
-/* clang-format on */
+} // namespace facebook::torchcodec
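The deleted note above ends with a 3x4 "colortwist" matrix whose fourth column carries the -128 offsets for U and V. As the note describes it, the offset in row i is applied to input channel i before the 3x3 multiply; a scalar sketch of that arithmetic (illustrative only, no clamping to [0, 255]):

#include <cstdio>

int main() {
  // The deleted bt709FullRangeColorTwist matrix from the top of the file.
  const float m[3][4] = {
      {1.0f, 0.0f, 1.5748f, 0.0f},
      {1.0f, -0.187324273f, -0.468124273f, -128.0f},
      {1.0f, 1.8556f, 0.0f, -128.0f}};
  // One decoded NV12 sample; Y, U and V all arrive in [0, 255].
  const float yuv[3] = {120.0f, 130.0f, 140.0f};
  // Fourth column first: Y + 0, U - 128, V - 128, re-centering U and V.
  const float p[3] = {
      yuv[0] + m[0][3], yuv[1] + m[1][3], yuv[2] + m[2][3]};
  // Then the plain 3x3 YUV -> RGB multiply derived in the note.
  for (int i = 0; i < 3; ++i) {
    float c = m[i][0] * p[0] + m[i][1] * p[1] + m[i][2] * p[2];
    std::printf("%c = %.2f\n", "RGB"[i], c);
  }
  return 0;
}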