torchcodec-0.3.0-cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchcodec might be problematic.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOBytesContext.cpp +70 -0
- torchcodec/_core/AVIOBytesContext.h +32 -0
- torchcodec/_core/AVIOContextHolder.cpp +50 -0
- torchcodec/_core/AVIOContextHolder.h +65 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
- torchcodec/_core/AVIOFileLikeContext.h +54 -0
- torchcodec/_core/CMakeLists.txt +237 -0
- torchcodec/_core/CudaDeviceInterface.cpp +289 -0
- torchcodec/_core/CudaDeviceInterface.h +34 -0
- torchcodec/_core/DeviceInterface.cpp +88 -0
- torchcodec/_core/DeviceInterface.h +66 -0
- torchcodec/_core/Encoder.cpp +319 -0
- torchcodec/_core/Encoder.h +39 -0
- torchcodec/_core/FFMPEGCommon.cpp +264 -0
- torchcodec/_core/FFMPEGCommon.h +180 -0
- torchcodec/_core/Frame.h +47 -0
- torchcodec/_core/Metadata.h +70 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
- torchcodec/_core/SingleStreamDecoder.h +462 -0
- torchcodec/_core/StreamOptions.h +49 -0
- torchcodec/_core/__init__.py +39 -0
- torchcodec/_core/_metadata.py +277 -0
- torchcodec/_core/custom_ops.cpp +681 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
- torchcodec/_core/ops.py +381 -0
- torchcodec/_core/pybind_ops.cpp +45 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +53 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +168 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +399 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_decoder4.so +0 -0
- torchcodec/libtorchcodec_decoder5.so +0 -0
- torchcodec/libtorchcodec_decoder6.so +0 -0
- torchcodec/libtorchcodec_decoder7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +285 -0
- torchcodec/samplers/_time_based.py +348 -0
- torchcodec/version.py +2 -0
- torchcodec-0.3.0.dist-info/LICENSE +28 -0
- torchcodec-0.3.0.dist-info/METADATA +280 -0
- torchcodec-0.3.0.dist-info/RECORD +57 -0
- torchcodec-0.3.0.dist-info/WHEEL +5 -0
- torchcodec-0.3.0.dist-info/top_level.txt +2 -0
torchcodec/_core/custom_ops.cpp
@@ -0,0 +1,681 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <pybind11/pybind11.h>
#include <cstdint>
#include <sstream>
#include <string>
#include "c10/core/SymIntArrayRef.h"
#include "c10/util/Exception.h"
#include "src/torchcodec/_core/AVIOBytesContext.h"
#include "src/torchcodec/_core/Encoder.h"
#include "src/torchcodec/_core/SingleStreamDecoder.h"

namespace facebook::torchcodec {

// ==============================
// Define the operators
// ==============================
// All instances of accepting the decoder as a tensor must be annotated with
// `Tensor(a!)`. The `(a!)` part normally indicates that the tensor is being
// mutated in place. We need it to make sure that torch.compile does not reorder
// calls to these functions. For more detail, see:
// https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native#readme
TORCH_LIBRARY(torchcodec_ns, m) {
  m.impl_abstract_pystub(
      "torchcodec._core.ops", "//pytorch/torchcodec:torchcodec");
  m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
  m.def(
      "create_audio_encoder(Tensor wf, int sample_rate, str filename, int? bit_rate=None) -> Tensor");
  m.def("encode_audio(Tensor(a!) encoder) -> ()");
  m.def(
      "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
  m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
  m.def(
      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, str? color_conversion_library=None) -> ()");
  m.def(
      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
  m.def(
      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None) -> ()");
  m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
  m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frame_at_pts(Tensor(a!) decoder, float seconds) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frame_at_index(Tensor(a!) decoder, *, int frame_index) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frames_at_indices(Tensor(a!) decoder, *, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frames_by_pts_in_range(Tensor(a!) decoder, *, float start_seconds, float stop_seconds) -> (Tensor, Tensor, Tensor)");
  m.def(
      "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
  m.def(
      "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
  m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
  m.def("get_json_metadata(Tensor(a!) decoder) -> str");
  m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
  m.def(
      "get_stream_json_metadata(Tensor(a!) decoder, int stream_index) -> str");
  m.def("_get_json_ffmpeg_library_versions() -> str");
  m.def(
      "_test_frame_pts_equality(Tensor(a!) decoder, *, int frame_index, float pts_seconds_to_test) -> bool");
  m.def("scan_all_streams_to_update_metadata(Tensor(a!) decoder) -> ()");
}
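
Every op defined above becomes callable from Python under torch.ops.torchcodec_ns once the shared libraries are loaded; importing torchcodec._core.ops, the pystub module named above, normally takes care of that. A minimal sketch, assuming a local file video.mp4:

import torch
import torchcodec._core.ops  # noqa: F401 -- loads the custom-op libraries

# The returned "decoder" is a small opaque int64 tensor whose storage wraps
# a C++ pointer; treat it as a handle, never read or mutate its values.
decoder = torch.ops.torchcodec_ns.create_from_file("video.mp4", "exact")
# Arguments after `*` in the schema strings are keyword-only in Python too.
torch.ops.torchcodec_ns.add_video_stream(decoder, dimension_order="NCHW")
frame, pts, duration = torch.ops.torchcodec_ns.get_next_frame(decoder)
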
namespace {

at::Tensor wrapDecoderPointerToTensor(
    std::unique_ptr<SingleStreamDecoder> uniqueDecoder) {
  SingleStreamDecoder* decoder = uniqueDecoder.release();

  auto deleter = [decoder](void*) { delete decoder; };
  at::Tensor tensor = at::from_blob(
      decoder, {sizeof(SingleStreamDecoder*)}, deleter, {at::kLong});
  auto videoDecoder =
      static_cast<SingleStreamDecoder*>(tensor.mutable_data_ptr());
  TORCH_CHECK_EQ(videoDecoder, decoder) << "videoDecoder=" << videoDecoder;
  return tensor;
}

SingleStreamDecoder* unwrapTensorToGetDecoder(at::Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(tensor.is_contiguous());
  void* buffer = tensor.mutable_data_ptr();
  SingleStreamDecoder* decoder = static_cast<SingleStreamDecoder*>(buffer);
  return decoder;
}

// The elements of this tuple are all tensors that represent a single frame:
// 1. The frame data, which is a multidimensional tensor.
// 2. A single float value for the pts in seconds.
// 3. A single float value for the duration in seconds.
// The reason we use Tensors for the second and third values is so we can run
// under torch.compile().
using OpsFrameOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;

OpsFrameOutput makeOpsFrameOutput(FrameOutput& frame) {
  return std::make_tuple(
      frame.data,
      torch::tensor(frame.ptsSeconds, torch::dtype(torch::kFloat64)),
      torch::tensor(frame.durationSeconds, torch::dtype(torch::kFloat64)));
}

// All elements of this tuple are tensors of the same leading dimension. The
// tuple represents the frames for N total frames, where N is the dimension of
// each stacked tensor. The elements are:
// 1. Stacked tensor of data for all N frames. Each frame is also a
//    multidimensional tensor.
// 2. Tensor of N pts values in seconds, where each pts is a single
//    float.
// 3. Tensor of N durations in seconds, where each duration is a
//    single float.
using OpsFrameBatchOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;

OpsFrameBatchOutput makeOpsFrameBatchOutput(FrameBatchOutput& batch) {
  return std::make_tuple(batch.data, batch.ptsSeconds, batch.durationSeconds);
}

// The elements of this tuple are all tensors that represent the concatenation
// of multiple audio frames:
// 1. The frames data (concatenated)
// 2. A single float value for the pts of the first frame, in seconds.
using OpsAudioFramesOutput = std::tuple<at::Tensor, at::Tensor>;

OpsAudioFramesOutput makeOpsAudioFramesOutput(AudioFramesOutput& audioFrames) {
  return std::make_tuple(
      audioFrames.data,
      torch::tensor(audioFrames.ptsSeconds, torch::dtype(torch::kFloat64)));
}

std::string quoteValue(const std::string& value) {
  return "\"" + value + "\"";
}

std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
  std::stringstream ss;
  ss << "{\n";
  auto it = metadataMap.begin();
  while (it != metadataMap.end()) {
    ss << "\"" << it->first << "\": " << it->second;
    ++it;
    if (it != metadataMap.end()) {
      ss << ",\n";
    } else {
      ss << "\n";
    }
  }
  ss << "}";

  return ss.str();
}

} // namespace
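
As the comments above note, the pts and duration slots are returned as float64 tensors rather than Python floats so the ops stay traceable under torch.compile(). A sketch of unpacking them on the Python side, continuing from the snippet above:

# `decoder` is the tensor handle created in the previous sketch.
frame, pts, duration = torch.ops.torchcodec_ns.get_frame_at_pts(decoder, 1.5)
pts_seconds = pts.item()           # float64 tensor -> Python float
duration_seconds = duration.item()
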

// ==============================
// Implementations for the operators
// ==============================

// Create a SingleStreamDecoder from file and wrap the pointer in a tensor.
at::Tensor create_from_file(
    std::string_view filename,
    std::optional<std::string_view> seek_mode = std::nullopt) {
  std::string filenameStr(filename);

  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
  if (seek_mode.has_value()) {
    realSeek = seekModeFromString(seek_mode.value());
  }

  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
      std::make_unique<SingleStreamDecoder>(filenameStr, realSeek);

  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
}

// Create a SingleStreamDecoder from the actual bytes of a video and wrap the
// pointer in a tensor. The SingleStreamDecoder will decode the provided bytes.
at::Tensor create_from_tensor(
    at::Tensor video_tensor,
    std::optional<std::string_view> seek_mode = std::nullopt) {
  TORCH_CHECK(video_tensor.is_contiguous(), "video_tensor must be contiguous");
  TORCH_CHECK(
      video_tensor.scalar_type() == torch::kUInt8,
      "video_tensor must be kUInt8");
  void* data = video_tensor.mutable_data_ptr();
  size_t length = video_tensor.numel();

  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
  if (seek_mode.has_value()) {
    realSeek = seekModeFromString(seek_mode.value());
  }

  auto contextHolder = std::make_unique<AVIOBytesContext>(data, length);

  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
      std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
}

at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
  auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
  std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
}

void _add_video_stream(
    at::Tensor& decoder,
    std::optional<int64_t> width = std::nullopt,
    std::optional<int64_t> height = std::nullopt,
    std::optional<int64_t> num_threads = std::nullopt,
    std::optional<std::string_view> dimension_order = std::nullopt,
    std::optional<int64_t> stream_index = std::nullopt,
    std::optional<std::string_view> device = std::nullopt,
    std::optional<std::string_view> color_conversion_library = std::nullopt) {
  VideoStreamOptions videoStreamOptions;
  videoStreamOptions.width = width;
  videoStreamOptions.height = height;
  videoStreamOptions.ffmpegThreadCount = num_threads;

  if (dimension_order.has_value()) {
    std::string stdDimensionOrder{dimension_order.value()};
    TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
    videoStreamOptions.dimensionOrder = stdDimensionOrder;
  }
  if (color_conversion_library.has_value()) {
    std::string stdColorConversionLibrary{color_conversion_library.value()};
    if (stdColorConversionLibrary == "filtergraph") {
      videoStreamOptions.colorConversionLibrary =
          ColorConversionLibrary::FILTERGRAPH;
    } else if (stdColorConversionLibrary == "swscale") {
      videoStreamOptions.colorConversionLibrary =
          ColorConversionLibrary::SWSCALE;
    } else {
      throw std::runtime_error(
          "Invalid color_conversion_library=" + stdColorConversionLibrary +
          ". color_conversion_library must be either filtergraph or swscale.");
    }
  }
  if (device.has_value()) {
    videoStreamOptions.device = createTorchDevice(std::string(device.value()));
  }

  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  videoDecoder->addVideoStream(stream_index.value_or(-1), videoStreamOptions);
}

// Add a new video stream at `stream_index` using the provided options.
void add_video_stream(
    at::Tensor& decoder,
    std::optional<int64_t> width = std::nullopt,
    std::optional<int64_t> height = std::nullopt,
    std::optional<int64_t> num_threads = std::nullopt,
    std::optional<std::string_view> dimension_order = std::nullopt,
    std::optional<int64_t> stream_index = std::nullopt,
    std::optional<std::string_view> device = std::nullopt) {
  _add_video_stream(
      decoder,
      width,
      height,
      num_threads,
      dimension_order,
      stream_index,
      device);
}

void add_audio_stream(
    at::Tensor& decoder,
    std::optional<int64_t> stream_index = std::nullopt,
    std::optional<int64_t> sample_rate = std::nullopt) {
  AudioStreamOptions audioStreamOptions;
  audioStreamOptions.sampleRate = sample_rate;

  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  videoDecoder->addAudioStream(stream_index.value_or(-1), audioStreamOptions);
}

// Seek to a particular presentation timestamp in the video in seconds.
void seek_to_pts(at::Tensor& decoder, double seconds) {
  auto videoDecoder =
      static_cast<SingleStreamDecoder*>(decoder.mutable_data_ptr());
  videoDecoder->setCursorPtsInSeconds(seconds);
}

// Get the next frame from the video as a tuple that has the frame data, pts
// and duration as tensors.
OpsFrameOutput get_next_frame(at::Tensor& decoder) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  FrameOutput result;
  try {
    result = videoDecoder->getNextFrame();
  } catch (const SingleStreamDecoder::EndOfFileException& e) {
    C10_THROW_ERROR(IndexError, e.what());
  }
  return makeOpsFrameOutput(result);
}

// Return the frame that is visible at a given timestamp in seconds. Each frame
// in FFMPEG has a presentation timestamp and a duration. The frame visible at
// a given timestamp T has T >= PTS and T < PTS + Duration.
OpsFrameOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  FrameOutput result;
  try {
    result = videoDecoder->getFramePlayedAt(seconds);
  } catch (const SingleStreamDecoder::EndOfFileException& e) {
    C10_THROW_ERROR(IndexError, e.what());
  }
  return makeOpsFrameOutput(result);
}

// Return the frame that is visible at a given index in the video.
OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  auto result = videoDecoder->getFrameAtIndex(frame_index);
  return makeOpsFrameOutput(result);
}

// Return the frames at given indices for a given stream.
OpsFrameBatchOutput get_frames_at_indices(
    at::Tensor& decoder,
    at::IntArrayRef frame_indices) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  std::vector<int64_t> frameIndicesVec(
      frame_indices.begin(), frame_indices.end());
  auto result = videoDecoder->getFramesAtIndices(frameIndicesVec);
  return makeOpsFrameBatchOutput(result);
}

// Return the frames inside a range as a single stacked Tensor. The range is
// defined as [start, stop).
OpsFrameBatchOutput get_frames_in_range(
    at::Tensor& decoder,
    int64_t start,
    int64_t stop,
    std::optional<int64_t> step = std::nullopt) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  auto result = videoDecoder->getFramesInRange(start, stop, step.value_or(1));
  return makeOpsFrameBatchOutput(result);
}

// Return the frames at the given pts values for a given stream.
OpsFrameBatchOutput get_frames_by_pts(
    at::Tensor& decoder,
    at::ArrayRef<double> timestamps) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  std::vector<double> timestampsVec(timestamps.begin(), timestamps.end());
  auto result = videoDecoder->getFramesPlayedAt(timestampsVec);
  return makeOpsFrameBatchOutput(result);
}

// Return the frames inside the range as a single stacked Tensor. The range is
// defined as [start_seconds, stop_seconds). The frames are stacked in pts
// order.
OpsFrameBatchOutput get_frames_by_pts_in_range(
    at::Tensor& decoder,
    double start_seconds,
    double stop_seconds) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  auto result =
      videoDecoder->getFramesPlayedInRange(start_seconds, stop_seconds);
  return makeOpsFrameBatchOutput(result);
}

OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
    at::Tensor& decoder,
    double start_seconds,
    std::optional<double> stop_seconds = std::nullopt) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  auto result =
      videoDecoder->getFramesPlayedInRangeAudio(start_seconds, stop_seconds);
  return makeOpsAudioFramesOutput(result);
}
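
Note that the audio range op returns only two tensors: the concatenated samples and the pts of the first returned frame. A sketch, under the same assumptions as the earlier snippets (stop_seconds is optional; omitting it reads to the end of the stream):

audio_decoder = torch.ops.torchcodec_ns.create_from_file("video.mp4")
torch.ops.torchcodec_ns.add_audio_stream(audio_decoder, sample_rate=16000)
samples, first_pts = torch.ops.torchcodec_ns.get_frames_by_pts_in_range_audio(
    audio_decoder, start_seconds=0.0, stop_seconds=10.0
)
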

at::Tensor wrapAudioEncoderPointerToTensor(
    std::unique_ptr<AudioEncoder> uniqueAudioEncoder) {
  AudioEncoder* encoder = uniqueAudioEncoder.release();

  auto deleter = [encoder](void*) { delete encoder; };
  at::Tensor tensor =
      at::from_blob(encoder, {sizeof(AudioEncoder*)}, deleter, {at::kLong});
  auto encoder_ = static_cast<AudioEncoder*>(tensor.mutable_data_ptr());
  TORCH_CHECK_EQ(encoder_, encoder) << "AudioEncoder=" << encoder_;
  return tensor;
}

AudioEncoder* unwrapTensorToGetAudioEncoder(at::Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(tensor.is_contiguous());
  void* buffer = tensor.mutable_data_ptr();
  AudioEncoder* encoder = static_cast<AudioEncoder*>(buffer);
  return encoder;
}

at::Tensor create_audio_encoder(
    const at::Tensor wf,
    int64_t sample_rate,
    std::string_view file_name,
    std::optional<int64_t> bit_rate = std::nullopt) {
  TORCH_CHECK(
      sample_rate <= std::numeric_limits<int>::max(),
      "sample_rate=",
      sample_rate,
      " is too large to be cast to an int.");
  std::unique_ptr<AudioEncoder> uniqueAudioEncoder =
      std::make_unique<AudioEncoder>(
          wf, static_cast<int>(sample_rate), file_name, bit_rate);
  return wrapAudioEncoderPointerToTensor(std::move(uniqueAudioEncoder));
}

void encode_audio(at::Tensor& encoder) {
  auto encoder_ = unwrapTensorToGetAudioEncoder(encoder);
  encoder_->encode();
}
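
Encoding mirrors decoding: create_audio_encoder returns an opaque tensor handle and encode_audio consumes it. A sketch; the expected waveform layout (assumed here to be float samples of shape (num_channels, num_samples)) and supported containers are defined in Encoder.cpp, which this hunk does not show:

wf = torch.rand(2, 16000)  # assumed layout: (num_channels, num_samples)
encoder = torch.ops.torchcodec_ns.create_audio_encoder(wf, 16000, "out.wav")
torch.ops.torchcodec_ns.encode_audio(encoder)
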

// For testing only. We need to implement this operation as a core library
// function because what we're testing is round-tripping pts values as
// double-precision floating point numbers from C++ to Python and back to C++.
// We want to make sure that the value is preserved exactly, bit-for-bit,
// during this process.
//
// Returns true if, for the given decoder, the pts value when converted to
// seconds as a double is exactly pts_seconds_to_test. Returns false otherwise.
bool _test_frame_pts_equality(
    at::Tensor& decoder,
    int64_t frame_index,
    double pts_seconds_to_test) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  return pts_seconds_to_test ==
      videoDecoder->getPtsSecondsForFrame(frame_index);
}

torch::Tensor _get_key_frame_indices(at::Tensor& decoder) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  return videoDecoder->getKeyFrameIndices();
}

// Get the metadata from the video as a string.
std::string get_json_metadata(at::Tensor& decoder) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);

  ContainerMetadata videoMetadata = videoDecoder->getContainerMetadata();
  auto maybeBestVideoStreamIndex = videoMetadata.bestVideoStreamIndex;

  std::map<std::string, std::string> metadataMap;
  // Serialize the metadata into a string.
  double durationSeconds = 0;
  if (maybeBestVideoStreamIndex.has_value() &&
      videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex]
          .durationSeconds.has_value()) {
    durationSeconds =
        videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex]
            .durationSeconds.value_or(0);
  } else {
    // Fallback to container-level duration if stream duration is not found.
    durationSeconds = videoMetadata.durationSeconds.value_or(0);
  }
  metadataMap["durationSeconds"] = std::to_string(durationSeconds);

  if (videoMetadata.bitRate.has_value()) {
    metadataMap["bitRate"] = std::to_string(videoMetadata.bitRate.value());
  }

  if (maybeBestVideoStreamIndex.has_value()) {
    auto streamMetadata =
        videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex];
    if (streamMetadata.numFramesFromScan.has_value()) {
      metadataMap["numFrames"] =
          std::to_string(*streamMetadata.numFramesFromScan);
    } else if (streamMetadata.numFrames.has_value()) {
      metadataMap["numFrames"] = std::to_string(*streamMetadata.numFrames);
    }
    if (streamMetadata.minPtsSecondsFromScan.has_value()) {
      metadataMap["minPtsSecondsFromScan"] =
          std::to_string(*streamMetadata.minPtsSecondsFromScan);
    }
    if (streamMetadata.maxPtsSecondsFromScan.has_value()) {
      metadataMap["maxPtsSecondsFromScan"] =
          std::to_string(*streamMetadata.maxPtsSecondsFromScan);
    }
    if (streamMetadata.codecName.has_value()) {
      metadataMap["codec"] = quoteValue(streamMetadata.codecName.value());
    }
    if (streamMetadata.width.has_value()) {
      metadataMap["width"] = std::to_string(*streamMetadata.width);
    }
    if (streamMetadata.height.has_value()) {
      metadataMap["height"] = std::to_string(*streamMetadata.height);
    }
    if (streamMetadata.averageFps.has_value()) {
      metadataMap["averageFps"] = std::to_string(*streamMetadata.averageFps);
    }
  }
  if (videoMetadata.bestVideoStreamIndex.has_value()) {
    metadataMap["bestVideoStreamIndex"] =
        std::to_string(*videoMetadata.bestVideoStreamIndex);
  }
  if (videoMetadata.bestAudioStreamIndex.has_value()) {
    metadataMap["bestAudioStreamIndex"] =
        std::to_string(*videoMetadata.bestAudioStreamIndex);
  }

  return mapToJson(metadataMap);
}

// Get the container metadata as a string.
std::string get_container_json_metadata(at::Tensor& decoder) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);

  auto containerMetadata = videoDecoder->getContainerMetadata();

  std::map<std::string, std::string> map;

  if (containerMetadata.durationSeconds.has_value()) {
    map["durationSeconds"] = std::to_string(*containerMetadata.durationSeconds);
  }

  if (containerMetadata.bitRate.has_value()) {
    map["bitRate"] = std::to_string(*containerMetadata.bitRate);
  }

  if (containerMetadata.bestVideoStreamIndex.has_value()) {
    map["bestVideoStreamIndex"] =
        std::to_string(*containerMetadata.bestVideoStreamIndex);
  }
  if (containerMetadata.bestAudioStreamIndex.has_value()) {
    map["bestAudioStreamIndex"] =
        std::to_string(*containerMetadata.bestAudioStreamIndex);
  }

  map["numStreams"] =
      std::to_string(containerMetadata.allStreamMetadata.size());

  return mapToJson(map);
}

// Get the stream metadata as a string.
std::string get_stream_json_metadata(
    at::Tensor& decoder,
    int64_t stream_index) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  auto allStreamMetadata =
      videoDecoder->getContainerMetadata().allStreamMetadata;
  if (stream_index < 0 ||
      stream_index >= static_cast<int64_t>(allStreamMetadata.size())) {
    throw std::out_of_range(
        "stream_index out of bounds: " + std::to_string(stream_index));
  }
  auto streamMetadata = allStreamMetadata[stream_index];

  std::map<std::string, std::string> map;

  if (streamMetadata.durationSeconds.has_value()) {
    map["durationSeconds"] = std::to_string(*streamMetadata.durationSeconds);
  }
  if (streamMetadata.bitRate.has_value()) {
    map["bitRate"] = std::to_string(*streamMetadata.bitRate);
  }
  if (streamMetadata.numFramesFromScan.has_value()) {
    map["numFramesFromScan"] =
        std::to_string(*streamMetadata.numFramesFromScan);
  }
  if (streamMetadata.numFrames.has_value()) {
    map["numFrames"] = std::to_string(*streamMetadata.numFrames);
  }
  if (streamMetadata.beginStreamFromHeader.has_value()) {
    map["beginStreamFromHeader"] =
        std::to_string(*streamMetadata.beginStreamFromHeader);
  }
  if (streamMetadata.minPtsSecondsFromScan.has_value()) {
    map["minPtsSecondsFromScan"] =
        std::to_string(*streamMetadata.minPtsSecondsFromScan);
  }
  if (streamMetadata.maxPtsSecondsFromScan.has_value()) {
    map["maxPtsSecondsFromScan"] =
        std::to_string(*streamMetadata.maxPtsSecondsFromScan);
  }
  if (streamMetadata.codecName.has_value()) {
    map["codec"] = quoteValue(streamMetadata.codecName.value());
  }
  if (streamMetadata.width.has_value()) {
    map["width"] = std::to_string(*streamMetadata.width);
  }
  if (streamMetadata.height.has_value()) {
    map["height"] = std::to_string(*streamMetadata.height);
  }
  if (streamMetadata.averageFps.has_value()) {
    map["averageFps"] = std::to_string(*streamMetadata.averageFps);
  }
  if (streamMetadata.sampleRate.has_value()) {
    map["sampleRate"] = std::to_string(*streamMetadata.sampleRate);
  }
  if (streamMetadata.numChannels.has_value()) {
    map["numChannels"] = std::to_string(*streamMetadata.numChannels);
  }
  if (streamMetadata.sampleFormat.has_value()) {
    map["sampleFormat"] = quoteValue(streamMetadata.sampleFormat.value());
  }
  if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {
    map["mediaType"] = quoteValue("video");
  } else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {
    map["mediaType"] = quoteValue("audio");
  } else {
    map["mediaType"] = quoteValue("other");
  }
  return mapToJson(map);
}
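
Because mapToJson() emits plain JSON (numbers unquoted, string values wrapped by quoteValue), the metadata ops round-trip cleanly through json.loads. A sketch, reusing the decoder handle from the first snippet:

import json

meta = json.loads(torch.ops.torchcodec_ns.get_json_metadata(decoder))
stream_meta = json.loads(
    torch.ops.torchcodec_ns.get_stream_json_metadata(decoder, 0)
)
print(meta.get("numFrames"), stream_meta.get("codec"))
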

// Returns version information about the various FFMPEG libraries that are
// loaded in the program's address space.
std::string _get_json_ffmpeg_library_versions() {
  std::stringstream ss;
  ss << "{\n";

  unsigned int version = avfilter_version();
  ss << "\"libavfilter\": [" << AV_VERSION_MAJOR(version) << ", "
     << AV_VERSION_MINOR(version) << ", " << AV_VERSION_MICRO(version)
     << "],\n";
  version = avutil_version();
  ss << "\"libavutil\": [" << AV_VERSION_MAJOR(version) << ", "
     << AV_VERSION_MINOR(version) << ", " << AV_VERSION_MICRO(version)
     << "],\n";
  version = avcodec_version();
  ss << "\"libavcodec\": [" << AV_VERSION_MAJOR(version) << ", "
     << AV_VERSION_MINOR(version) << ", " << AV_VERSION_MICRO(version)
     << "],\n";
  version = avformat_version();
  ss << "\"libavformat\": [" << AV_VERSION_MAJOR(version) << ", "
     << AV_VERSION_MINOR(version) << ", " << AV_VERSION_MICRO(version)
     << "],\n";
  ss << "\"ffmpeg_version\": \"" << av_version_info() << "\"\n";
  ss << "}\n";

  return ss.str();
}

// Scans video packets to get more accurate metadata like frame count, exact
// keyframe positions, etc. Exact keyframe positions are useful for efficient
// accurate seeking. Note that this function reads the entire video but it does
// not decode frames. Reading a video file is much cheaper than decoding it.
void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  videoDecoder->scanFileAndUpdateMetadataAndIndex();
}

TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
  m.impl("create_from_file", &create_from_file);
  m.impl("create_audio_encoder", &create_audio_encoder);
  m.impl("create_from_tensor", &create_from_tensor);
  m.impl("_convert_to_tensor", &_convert_to_tensor);
  m.impl(
      "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
}

TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
  m.impl("encode_audio", &encode_audio);
  m.impl("seek_to_pts", &seek_to_pts);
  m.impl("add_video_stream", &add_video_stream);
  m.impl("_add_video_stream", &_add_video_stream);
  m.impl("add_audio_stream", &add_audio_stream);
  m.impl("get_next_frame", &get_next_frame);
  m.impl("_get_key_frame_indices", &_get_key_frame_indices);
  m.impl("get_json_metadata", &get_json_metadata);
  m.impl("get_container_json_metadata", &get_container_json_metadata);
  m.impl("get_stream_json_metadata", &get_stream_json_metadata);
  m.impl("get_frame_at_pts", &get_frame_at_pts);
  m.impl("get_frame_at_index", &get_frame_at_index);
  m.impl("get_frames_at_indices", &get_frames_at_indices);
  m.impl("get_frames_in_range", &get_frames_in_range);
  m.impl("get_frames_by_pts_in_range", &get_frames_by_pts_in_range);
  m.impl("get_frames_by_pts_in_range_audio", &get_frames_by_pts_in_range_audio);
  m.impl("get_frames_by_pts", &get_frames_by_pts);
  m.impl("_test_frame_pts_equality", &_test_frame_pts_equality);
  m.impl(
      "scan_all_streams_to_update_metadata",
      &scan_all_streams_to_update_metadata);
}

} // namespace facebook::torchcodec