torchcodec 0.7.0__cp312-cp312-win_amd64.whl → 0.8.1__cp312-cp312-win_amd64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of torchcodec might be problematic.
- torchcodec/_core/AVIOTensorContext.cpp +23 -16
- torchcodec/_core/AVIOTensorContext.h +2 -1
- torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
- torchcodec/_core/CMakeLists.txt +18 -3
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +6 -20
- torchcodec/_core/CpuDeviceInterface.cpp +195 -108
- torchcodec/_core/CpuDeviceInterface.h +84 -19
- torchcodec/_core/CudaDeviceInterface.cpp +227 -376
- torchcodec/_core/CudaDeviceInterface.h +38 -6
- torchcodec/_core/DeviceInterface.cpp +57 -19
- torchcodec/_core/DeviceInterface.h +97 -16
- torchcodec/_core/Encoder.cpp +346 -9
- torchcodec/_core/Encoder.h +62 -1
- torchcodec/_core/FFMPEGCommon.cpp +190 -3
- torchcodec/_core/FFMPEGCommon.h +27 -1
- torchcodec/_core/FilterGraph.cpp +30 -22
- torchcodec/_core/FilterGraph.h +15 -1
- torchcodec/_core/Frame.cpp +22 -7
- torchcodec/_core/Frame.h +15 -61
- torchcodec/_core/Metadata.h +2 -2
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +196 -201
- torchcodec/_core/SingleStreamDecoder.h +42 -15
- torchcodec/_core/StreamOptions.h +16 -6
- torchcodec/_core/Transform.cpp +87 -0
- torchcodec/_core/Transform.h +84 -0
- torchcodec/_core/__init__.py +4 -0
- torchcodec/_core/custom_ops.cpp +257 -32
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +147 -44
- torchcodec/_core/pybind_ops.cpp +22 -59
- torchcodec/_samplers/video_clip_sampler.py +7 -19
- torchcodec/decoders/__init__.py +1 -0
- torchcodec/decoders/_decoder_utils.py +61 -1
- torchcodec/decoders/_video_decoder.py +46 -20
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/samplers/_time_based.py +8 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
- torchcodec-0.8.1.dist-info/RECORD +82 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
- torchcodec-0.7.0.dist-info/RECORD +0 -67
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/custom_ops.cpp
CHANGED
@@ -10,6 +10,7 @@
 #include <string>
 #include "c10/core/SymIntArrayRef.h"
 #include "c10/util/Exception.h"
+#include "src/torchcodec/_core/AVIOFileLikeContext.h"
 #include "src/torchcodec/_core/AVIOTensorContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
@@ -33,13 +34,22 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
       "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
+  m.def(
+      "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
+  m.def(
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
+  m.def(
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, int? crf=None) -> Tensor");
+  m.def(
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, int? crf=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
-  m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
   m.def(
-      "
+      "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
+  m.def(
+      "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"ffmpeg\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int?
+      "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"ffmpeg\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -49,7 +59,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frame_at_index(Tensor(a!) decoder, *, int frame_index) -> (Tensor, Tensor, Tensor)");
   m.def(
-      "get_frames_at_indices(Tensor(a!) decoder, *,
+      "get_frames_at_indices(Tensor(a!) decoder, *, Tensor frame_indices) -> (Tensor, Tensor, Tensor)");
   m.def(
       "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
   m.def(
@@ -57,13 +67,14 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
   m.def(
-      "get_frames_by_pts(Tensor(a!) decoder, *,
+      "get_frames_by_pts(Tensor(a!) decoder, *, Tensor timestamps) -> (Tensor, Tensor, Tensor)");
   m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
   m.def(
       "get_stream_json_metadata(Tensor(a!) decoder, int stream_index) -> str");
   m.def("_get_json_ffmpeg_library_versions() -> str");
+  m.def("_get_backend_details(Tensor(a!) decoder) -> str");
   m.def(
       "_test_frame_pts_equality(Tensor(a!) decoder, *, int frame_index, float pts_seconds_to_test) -> bool");
   m.def("scan_all_streams_to_update_metadata(Tensor(a!) decoder) -> ()");
@@ -165,6 +176,103 @@ std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
   return ss.str();
 }

+SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
+  if (seekMode == "exact") {
+    return SingleStreamDecoder::SeekMode::exact;
+  } else if (seekMode == "approximate") {
+    return SingleStreamDecoder::SeekMode::approximate;
+  } else if (seekMode == "custom_frame_mappings") {
+    return SingleStreamDecoder::SeekMode::custom_frame_mappings;
+  } else {
+    TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
+  }
+}
+
+int checkedToPositiveInt(const std::string& str) {
+  int ret = 0;
+  try {
+    ret = std::stoi(str);
+  } catch (const std::invalid_argument&) {
+    TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+  } catch (const std::out_of_range&) {
+    TORCH_CHECK(false, "String would become integer out of range:" + str);
+  }
+  TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+  return ret;
+}
+
+// Resize transform specs take the form:
+//
+// "resize, <height>, <width>"
+//
+// Where "resize" is the string literal and <height> and <width> are positive
+// integers.
+Transform* makeResizeTransform(
+    const std::vector<std::string>& resizeTransformSpec) {
+  TORCH_CHECK(
+      resizeTransformSpec.size() == 3,
+      "resizeTransformSpec must have 3 elements including its name");
+  int height = checkedToPositiveInt(resizeTransformSpec[1]);
+  int width = checkedToPositiveInt(resizeTransformSpec[2]);
+  return new ResizeTransform(FrameDims(height, width));
+}
+
+// Crop transform specs take the form:
+//
+// "crop, <height>, <width>, <x>, <y>"
+//
+// Where "crop" is the string literal and <height>, <width>, <x> and <y> are
+// positive integers. <x> and <y> are the x and y coordinates of the top left
+// corner of the crop. Note that we follow the PyTorch convention of (height,
+// width) for specifying image dimensions; FFmpeg uses (width, height).
+Transform* makeCropTransform(
+    const std::vector<std::string>& cropTransformSpec) {
+  TORCH_CHECK(
+      cropTransformSpec.size() == 5,
+      "cropTransformSpec must have 5 elements including its name");
+  int height = checkedToPositiveInt(cropTransformSpec[1]);
+  int width = checkedToPositiveInt(cropTransformSpec[2]);
+  int x = checkedToPositiveInt(cropTransformSpec[3]);
+  int y = checkedToPositiveInt(cropTransformSpec[4]);
+  return new CropTransform(FrameDims(height, width), x, y);
+}
+
+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(str);
+  while (std::getline(tokenStream, token, delimiter)) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// The transformSpecsRaw string is always in the format:
+//
+// "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+//
+// Where "nameX" is the name of the transform, and "paramX" are the parameters.
+std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+  std::vector<Transform*> transforms;
+  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+  for (const std::string& transformSpecRaw : transformSpecs) {
+    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+    TORCH_CHECK(
+        transformSpec.size() >= 1,
+        "Invalid transform spec: " + transformSpecRaw);
+
+    auto name = transformSpec[0];
+    if (name == "resize") {
+      transforms.push_back(makeResizeTransform(transformSpec));
+    } else if (name == "crop") {
+      transforms.push_back(makeCropTransform(transformSpec));
+    } else {
+      TORCH_CHECK(false, "Invalid transform name: " + name);
+    }
+  }
+  return transforms;
+}
+
 } // namespace

 // ==============================
@@ -203,33 +311,47 @@ at::Tensor create_from_tensor(
     realSeek = seekModeFromString(seek_mode.value());
   }

-  auto
+  auto avioContextHolder =
+      std::make_unique<AVIOFromTensorContext>(video_tensor);

   std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
-      std::make_unique<SingleStreamDecoder>(
+      std::make_unique<SingleStreamDecoder>(
+          std::move(avioContextHolder), realSeek);
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }

-at::Tensor
-
-
+at::Tensor _create_from_file_like(
+    int64_t file_like_context,
+    std::optional<std::string_view> seek_mode) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
+  if (seek_mode.has_value()) {
+    realSeek = seekModeFromString(seek_mode.value());
+  }
+
+  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
+      std::make_unique<SingleStreamDecoder>(
+          std::move(avioContextHolder), realSeek);
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }

 void _add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
-    std::
+    std::string_view device = "cpu",
+    std::string_view device_variant = "ffmpeg",
+    std::string_view transform_specs = "",
     std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
-  videoStreamOptions.width = width;
-  videoStreamOptions.height = height;
   videoStreamOptions.ffmpegThreadCount = num_threads;

   if (dimension_order.has_value()) {
@@ -253,37 +375,46 @@ void _add_video_stream(
         ". color_conversion_library must be either filtergraph or swscale.");
     }
   }
-
-
-
+
+  validateDeviceInterface(std::string(device), std::string(device_variant));
+
+  videoStreamOptions.device = torch::Device(std::string(device));
+  videoStreamOptions.deviceVariant = device_variant;
+
+  std::vector<Transform*> transforms =
+      makeTransforms(std::string(transform_specs));
+
   std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
       custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
       : std::nullopt;
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   videoDecoder->addVideoStream(
-      stream_index.value_or(-1),
+      stream_index.value_or(-1),
+      transforms,
+      videoStreamOptions,
+      converted_mappings);
 }

 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
-    std::
+    std::string_view device = "cpu",
+    std::string_view device_variant = "ffmpeg",
+    std::string_view transform_specs = "",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
       decoder,
-      width,
-      height,
       num_threads,
       dimension_order,
       stream_index,
       device,
+      device_variant,
+      transform_specs,
       custom_frame_mappings);
 }

@@ -344,11 +475,9 @@ OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index) {
 // Return the frames at given indices for a given stream
 OpsFrameBatchOutput get_frames_at_indices(
     at::Tensor& decoder,
-    at::
+    const at::Tensor& frame_indices) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-
-      frame_indices.begin(), frame_indices.end());
-  auto result = videoDecoder->getFramesAtIndices(frameIndicesVec);
+  auto result = videoDecoder->getFramesAtIndices(frame_indices);
   return makeOpsFrameBatchOutput(result);
 }

@@ -367,10 +496,9 @@ OpsFrameBatchOutput get_frames_in_range(
 // Return the frames at given ptss for a given stream
 OpsFrameBatchOutput get_frames_by_pts(
     at::Tensor& decoder,
-    at::
+    const at::Tensor& timestamps) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-
-  auto result = videoDecoder->getFramesPlayedAt(timestampsVec);
+  auto result = videoDecoder->getFramesPlayedAt(timestamps);
   return makeOpsFrameBatchOutput(result);
 }

@@ -441,6 +569,92 @@ at::Tensor encode_audio_to_tensor(
       .encodeToTensor();
 }

+void _encode_audio_to_file_like(
+    const at::Tensor& samples,
+    int64_t sample_rate,
+    std::string_view format,
+    int64_t file_like_context,
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt,
+    std::optional<int64_t> desired_sample_rate = std::nullopt) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+  AudioStreamOptions audioStreamOptions;
+  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
+  audioStreamOptions.numChannels =
+      validateOptionalInt64ToInt(num_channels, "num_channels");
+  audioStreamOptions.sampleRate =
+      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
+
+  AudioEncoder encoder(
+      samples,
+      validateInt64ToInt(sample_rate, "sample_rate"),
+      format,
+      std::move(avioContextHolder),
+      audioStreamOptions);
+  encoder.encode();
+}
+
+void encode_video_to_file(
+    const at::Tensor& frames,
+    int64_t frame_rate,
+    std::string_view file_name,
+    std::optional<int64_t> crf = std::nullopt) {
+  VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.crf = crf;
+  VideoEncoder(
+      frames,
+      validateInt64ToInt(frame_rate, "frame_rate"),
+      file_name,
+      videoStreamOptions)
+      .encode();
+}
+
+at::Tensor encode_video_to_tensor(
+    const at::Tensor& frames,
+    int64_t frame_rate,
+    std::string_view format,
+    std::optional<int64_t> crf = std::nullopt) {
+  auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
+  VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.crf = crf;
+  return VideoEncoder(
+      frames,
+      validateInt64ToInt(frame_rate, "frame_rate"),
+      format,
+      std::move(avioContextHolder),
+      videoStreamOptions)
+      .encodeToTensor();
+}
+
+void _encode_video_to_file_like(
+    const at::Tensor& frames,
+    int64_t frame_rate,
+    std::string_view format,
+    int64_t file_like_context,
+    std::optional<int64_t> crf = std::nullopt) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+  VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.crf = crf;
+
+  VideoEncoder encoder(
+      frames,
+      validateInt64ToInt(frame_rate, "frame_rate"),
+      format,
+      std::move(avioContextHolder),
+      videoStreamOptions);
+  encoder.encode();
+}
+
 // For testing only. We need to implement this operation as a core library
 // function because what we're testing is round-tripping pts values as
 // double-precision floating point numbers from C++ to Python and back to C++.
@@ -682,6 +896,11 @@ std::string _get_json_ffmpeg_library_versions() {
   return ss.str();
 }

+std::string get_backend_details(at::Tensor& decoder) {
+  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
+  return videoDecoder->getDeviceInterfaceDetails();
+}
+
 // Scans video packets to get more accurate metadata like frame count, exact
 // keyframe positions, etc. Exact keyframe positions are useful for efficient
 // accurate seeking. Note that this function reads the entire video but it does
@@ -694,7 +913,7 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("create_from_file", &create_from_file);
   m.impl("create_from_tensor", &create_from_tensor);
-  m.impl("
+  m.impl("_create_from_file_like", &_create_from_file_like);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
 }
@@ -702,6 +921,10 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("encode_audio_to_file", &encode_audio_to_file);
   m.impl("encode_audio_to_tensor", &encode_audio_to_tensor);
+  m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like);
+  m.impl("encode_video_to_file", &encode_video_to_file);
+  m.impl("encode_video_to_tensor", &encode_video_to_tensor);
+  m.impl("_encode_video_to_file_like", &_encode_video_to_file_like);
   m.impl("seek_to_pts", &seek_to_pts);
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
@@ -722,6 +945,8 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl(
       "scan_all_streams_to_update_metadata",
       &scan_all_streams_to_update_metadata);
+
+  m.impl("_get_backend_details", &get_backend_details);
 }

 } // namespace facebook::torchcodec
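The schemas above are registered under the `torchcodec_ns` namespace, so once the native libraries are loaded they are reachable from Python via `torch.ops.torchcodec_ns`. The sketch below is a hypothetical, minimal exercise of the new `transform_specs` argument and the Tensor-based `frame_indices` signature, not the package's public API (that lives in `torchcodec.decoders`). It assumes that `import torchcodec` loads the custom-op libraries, that a local `video.mp4` exists, and that the names chosen for the returned tuple are descriptive guesses; the spec grammar itself comes from the `makeTransforms()` parser in the diff.

```python
import torch
import torchcodec  # noqa: F401  -- assumed to load the torchcodec_ns custom ops

# Feed the container bytes in as a uint8 tensor, per the
# "create_from_tensor(Tensor video_tensor, str? seek_mode=None)" schema above.
with open("video.mp4", "rb") as f:  # hypothetical local file
    video_tensor = torch.frombuffer(bytearray(f.read()), dtype=torch.uint8)

decoder = torch.ops.torchcodec_ns.create_from_tensor(video_tensor, "exact")

# transform_specs follows the grammar parsed by makeTransforms():
#   "resize, <height>, <width>; crop, <height>, <width>, <x>, <y>"
# with all parameters required to be positive integers.
torch.ops.torchcodec_ns.add_video_stream(
    decoder,
    transform_specs="resize, 256, 256; crop, 224, 224, 16, 16",
)

# frame_indices is now a Tensor rather than an int list (see the schema change).
frames, pts_seconds, duration_seconds = torch.ops.torchcodec_ns.get_frames_at_indices(
    decoder, frame_indices=torch.tensor([0, 10, 20])
)
```

The same decoder tensor can also be queried with the new `_get_backend_details` op to inspect which device interface is in use, per the `"_get_backend_details(Tensor(a!) decoder) -> str"` schema.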
torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake
CHANGED
@@ -44,6 +44,10 @@ if (LINUX)
         f7_sha256
         1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
     )
+    set(
+        f8_sha256
+        c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
+    )
     set(
         f4_library_file_names
         libavutil.so.56
@@ -84,6 +88,16 @@ if (LINUX)
         libswscale.so.8
         libswresample.so.5
     )
+    set(
+        f8_library_file_names
+        libavutil.so.60
+        libavcodec.so.62
+        libavformat.so.62
+        libavdevice.so.62
+        libavfilter.so.11
+        libswscale.so.9
+        libswresample.so.6
+    )
 elseif (APPLE)
     set(lib_dir "lib")
     set(
@@ -106,6 +120,10 @@ elseif (APPLE)
         f7_sha256
         48a4fc8ce098305cfd4a58f40889249c523ca3c285f66ba704b5bad0e3ada53a
     )
+    set(
+        f8_sha256
+        beb936b76f25d2621228a12cdb67c9ae3d1eff7aa713ef8d1167ebf0c25bd5ec
+    )

     set(
         f4_library_file_names
@@ -147,6 +165,16 @@ elseif (APPLE)
         libswscale.8.dylib
         libswresample.5.dylib
     )
+    set(
+        f8_library_file_names
+        libavutil.60.dylib
+        libavcodec.62.dylib
+        libavformat.62.dylib
+        libavdevice.62.dylib
+        libavfilter.11.dylib
+        libswscale.9.dylib
+        libswresample.6.dylib
+    )

 elseif (WIN32)
     set(lib_dir "bin")
@@ -170,6 +198,10 @@ elseif (WIN32)
         f7_sha256
         ae391ace382330e912793b70b68529ee7c91026d2869b4df7e7c3e7d3656bdd5
     )
+    set(
+        f8_sha256
+        bac845ac79876b104959cb0e7b9dec772a261116344dd17d2f97e7ddfac4a73f
+    )

     set(
         f4_library_file_names
@@ -211,6 +243,16 @@ elseif (WIN32)
         swscale.lib
         swresample.lib
     )
+    set(
+        f8_library_file_names
+        avutil.lib
+        avcodec.lib
+        avformat.lib
+        avdevice.lib
+        avfilter.lib
+        swscale.lib
+        swresample.lib
+    )
 else()
     message(
         FATAL_ERROR
@@ -242,19 +284,27 @@ FetchContent_Declare(
     URL_HASH
     SHA256=${f7_sha256}
 )
+FetchContent_Declare(
+    f8
+    URL ${platform_url}/8.0.tar.gz
+    URL_HASH
+    SHA256=${f8_sha256}
+)

-FetchContent_MakeAvailable(f4 f5 f6 f7)
+FetchContent_MakeAvailable(f4 f5 f6 f7 f8)

 add_library(ffmpeg4 INTERFACE)
 add_library(ffmpeg5 INTERFACE)
 add_library(ffmpeg6 INTERFACE)
 add_library(ffmpeg7 INTERFACE)
+add_library(ffmpeg8 INTERFACE)

 # Note: the f?_SOURCE_DIR variables were set by FetchContent_MakeAvailable
 target_include_directories(ffmpeg4 INTERFACE ${f4_SOURCE_DIR}/include)
 target_include_directories(ffmpeg5 INTERFACE ${f5_SOURCE_DIR}/include)
 target_include_directories(ffmpeg6 INTERFACE ${f6_SOURCE_DIR}/include)
 target_include_directories(ffmpeg7 INTERFACE ${f7_SOURCE_DIR}/include)
+target_include_directories(ffmpeg8 INTERFACE ${f8_SOURCE_DIR}/include)


 list(
@@ -277,6 +327,11 @@ list(
     PREPEND ${f7_SOURCE_DIR}/${lib_dir}/
     OUTPUT_VARIABLE f7_library_paths
 )
+list(
+    TRANSFORM f8_library_file_names
+    PREPEND ${f8_SOURCE_DIR}/${lib_dir}/
+    OUTPUT_VARIABLE f8_library_paths
+)

 target_link_libraries(
     ffmpeg4
@@ -298,3 +353,8 @@ target_link_libraries(
     INTERFACE
     ${f7_library_paths}
 )
+target_link_libraries(
+    ffmpeg8
+    INTERFACE
+    ${f8_library_paths}
+)
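With FFmpeg 8 added to the fetched and linked library set above, the wheel now ships core libraries built against FFmpeg major versions 4 through 8 (note the new `libtorchcodec_core8.dll` and friends in the file list) and resolves one of them at import time. A quick, hedged way to confirm which FFmpeg build was actually loaded is the `_get_json_ffmpeg_library_versions` op, whose schema appears in the custom_ops.cpp diff; the snippet below assumes that `import torchcodec` registers the op and that the returned string is JSON, as its name suggests.

```python
import json

import torch
import torchcodec  # noqa: F401  -- assumed to register torch.ops.torchcodec_ns

# Schema from the diff: "_get_json_ffmpeg_library_versions() -> str".
# The string is expected to describe the libavcodec/libavformat/... versions
# the loaded core library was built against.
versions_json = torch.ops.torchcodec_ns._get_json_ffmpeg_library_versions()
print(json.loads(versions_json))
```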