torchcodec 0.7.0__cp313-cp313-win_amd64.whl → 0.8.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchcodec might be problematic.

Files changed (66)
  1. torchcodec/_core/AVIOTensorContext.cpp +23 -16
  2. torchcodec/_core/AVIOTensorContext.h +2 -1
  3. torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
  4. torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
  5. torchcodec/_core/CMakeLists.txt +18 -3
  6. torchcodec/_core/CUDACommon.cpp +330 -0
  7. torchcodec/_core/CUDACommon.h +51 -0
  8. torchcodec/_core/Cache.h +6 -20
  9. torchcodec/_core/CpuDeviceInterface.cpp +195 -108
  10. torchcodec/_core/CpuDeviceInterface.h +84 -19
  11. torchcodec/_core/CudaDeviceInterface.cpp +227 -376
  12. torchcodec/_core/CudaDeviceInterface.h +38 -6
  13. torchcodec/_core/DeviceInterface.cpp +57 -19
  14. torchcodec/_core/DeviceInterface.h +97 -16
  15. torchcodec/_core/Encoder.cpp +346 -9
  16. torchcodec/_core/Encoder.h +62 -1
  17. torchcodec/_core/FFMPEGCommon.cpp +190 -3
  18. torchcodec/_core/FFMPEGCommon.h +27 -1
  19. torchcodec/_core/FilterGraph.cpp +30 -22
  20. torchcodec/_core/FilterGraph.h +15 -1
  21. torchcodec/_core/Frame.cpp +22 -7
  22. torchcodec/_core/Frame.h +15 -61
  23. torchcodec/_core/Metadata.h +2 -2
  24. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  25. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  26. torchcodec/_core/NVDECCache.cpp +60 -0
  27. torchcodec/_core/NVDECCache.h +102 -0
  28. torchcodec/_core/SingleStreamDecoder.cpp +196 -201
  29. torchcodec/_core/SingleStreamDecoder.h +42 -15
  30. torchcodec/_core/StreamOptions.h +16 -6
  31. torchcodec/_core/Transform.cpp +87 -0
  32. torchcodec/_core/Transform.h +84 -0
  33. torchcodec/_core/__init__.py +4 -0
  34. torchcodec/_core/custom_ops.cpp +257 -32
  35. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  36. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  37. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  38. torchcodec/_core/ops.py +147 -44
  39. torchcodec/_core/pybind_ops.cpp +22 -59
  40. torchcodec/_samplers/video_clip_sampler.py +7 -19
  41. torchcodec/decoders/__init__.py +1 -0
  42. torchcodec/decoders/_decoder_utils.py +61 -1
  43. torchcodec/decoders/_video_decoder.py +46 -20
  44. torchcodec/libtorchcodec_core4.dll +0 -0
  45. torchcodec/libtorchcodec_core5.dll +0 -0
  46. torchcodec/libtorchcodec_core6.dll +0 -0
  47. torchcodec/libtorchcodec_core7.dll +0 -0
  48. torchcodec/libtorchcodec_core8.dll +0 -0
  49. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  50. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  51. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  52. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  53. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  54. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  55. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  56. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  57. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  58. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  59. torchcodec/samplers/_time_based.py +8 -0
  60. torchcodec/version.py +1 -1
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
  62. torchcodec-0.8.1.dist-info/RECORD +82 -0
  63. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
  64. torchcodec-0.7.0.dist-info/RECORD +0 -67
  65. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
  66. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/custom_ops.cpp

@@ -10,6 +10,7 @@
  #include <string>
  #include "c10/core/SymIntArrayRef.h"
  #include "c10/util/Exception.h"
+ #include "src/torchcodec/_core/AVIOFileLikeContext.h"
  #include "src/torchcodec/_core/AVIOTensorContext.h"
  #include "src/torchcodec/_core/Encoder.h"
  #include "src/torchcodec/_core/SingleStreamDecoder.h"
@@ -33,13 +34,22 @@ TORCH_LIBRARY(torchcodec_ns, m) {
  "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
  m.def(
  "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
+ m.def(
+ "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
+ m.def(
+ "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
+ m.def(
+ "encode_video_to_tensor(Tensor frames, int frame_rate, str format, int? crf=None) -> Tensor");
+ m.def(
+ "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, int? crf=None) -> ()");
  m.def(
  "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
- m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
  m.def(
- "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+ "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
+ m.def(
+ "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"ffmpeg\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
  m.def(
- "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+ "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"ffmpeg\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
  m.def(
  "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
  m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -49,7 +59,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
  m.def(
  "get_frame_at_index(Tensor(a!) decoder, *, int frame_index) -> (Tensor, Tensor, Tensor)");
  m.def(
- "get_frames_at_indices(Tensor(a!) decoder, *, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
+ "get_frames_at_indices(Tensor(a!) decoder, *, Tensor frame_indices) -> (Tensor, Tensor, Tensor)");
  m.def(
  "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
  m.def(
@@ -57,13 +67,14 @@ TORCH_LIBRARY(torchcodec_ns, m) {
  m.def(
  "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
  m.def(
- "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
+ "get_frames_by_pts(Tensor(a!) decoder, *, Tensor timestamps) -> (Tensor, Tensor, Tensor)");
  m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
  m.def("get_json_metadata(Tensor(a!) decoder) -> str");
  m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
  m.def(
  "get_stream_json_metadata(Tensor(a!) decoder, int stream_index) -> str");
  m.def("_get_json_ffmpeg_library_versions() -> str");
+ m.def("_get_backend_details(Tensor(a!) decoder) -> str");
  m.def(
  "_test_frame_pts_equality(Tensor(a!) decoder, *, int frame_index, float pts_seconds_to_test) -> bool");
  m.def("scan_all_streams_to_update_metadata(Tensor(a!) decoder) -> ()");
@@ -165,6 +176,103 @@ std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
  return ss.str();
  }

+ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
+ if (seekMode == "exact") {
+ return SingleStreamDecoder::SeekMode::exact;
+ } else if (seekMode == "approximate") {
+ return SingleStreamDecoder::SeekMode::approximate;
+ } else if (seekMode == "custom_frame_mappings") {
+ return SingleStreamDecoder::SeekMode::custom_frame_mappings;
+ } else {
+ TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
+ }
+ }
+
+ int checkedToPositiveInt(const std::string& str) {
+ int ret = 0;
+ try {
+ ret = std::stoi(str);
+ } catch (const std::invalid_argument&) {
+ TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+ } catch (const std::out_of_range&) {
+ TORCH_CHECK(false, "String would become integer out of range:" + str);
+ }
+ TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+ return ret;
+ }
+
+ // Resize transform specs take the form:
+ //
+ // "resize, <height>, <width>"
+ //
+ // Where "resize" is the string literal and <height> and <width> are positive
+ // integers.
+ Transform* makeResizeTransform(
+ const std::vector<std::string>& resizeTransformSpec) {
+ TORCH_CHECK(
+ resizeTransformSpec.size() == 3,
+ "resizeTransformSpec must have 3 elements including its name");
+ int height = checkedToPositiveInt(resizeTransformSpec[1]);
+ int width = checkedToPositiveInt(resizeTransformSpec[2]);
+ return new ResizeTransform(FrameDims(height, width));
+ }
+
+ // Crop transform specs take the form:
+ //
+ // "crop, <height>, <width>, <x>, <y>"
+ //
+ // Where "crop" is the string literal and <height>, <width>, <x> and <y> are
+ // positive integers. <x> and <y> are the x and y coordinates of the top left
+ // corner of the crop. Note that we follow the PyTorch convention of (height,
+ // width) for specifying image dimensions; FFmpeg uses (width, height).
+ Transform* makeCropTransform(
+ const std::vector<std::string>& cropTransformSpec) {
+ TORCH_CHECK(
+ cropTransformSpec.size() == 5,
+ "cropTransformSpec must have 5 elements including its name");
+ int height = checkedToPositiveInt(cropTransformSpec[1]);
+ int width = checkedToPositiveInt(cropTransformSpec[2]);
+ int x = checkedToPositiveInt(cropTransformSpec[3]);
+ int y = checkedToPositiveInt(cropTransformSpec[4]);
+ return new CropTransform(FrameDims(height, width), x, y);
+ }
+
+ std::vector<std::string> split(const std::string& str, char delimiter) {
+ std::vector<std::string> tokens;
+ std::string token;
+ std::istringstream tokenStream(str);
+ while (std::getline(tokenStream, token, delimiter)) {
+ tokens.push_back(token);
+ }
+ return tokens;
+ }
+
+ // The transformSpecsRaw string is always in the format:
+ //
+ // "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+ //
+ // Where "nameX" is the name of the transform, and "paramX" are the parameters.
+ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+ std::vector<Transform*> transforms;
+ std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+ for (const std::string& transformSpecRaw : transformSpecs) {
+ std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+ TORCH_CHECK(
+ transformSpec.size() >= 1,
+ "Invalid transform spec: " + transformSpecRaw);
+
+ auto name = transformSpec[0];
+ if (name == "resize") {
+ transforms.push_back(makeResizeTransform(transformSpec));
+ } else if (name == "crop") {
+ transforms.push_back(makeCropTransform(transformSpec));
+ } else {
+ TORCH_CHECK(false, "Invalid transform name: " + name);
+ }
+ }
+ return transforms;
+ }
+
  } // namespace

  // ==============================
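The helpers above define the wire format that add_video_stream's transform_specs string is parsed with: transforms are separated by ';', fields by ',', resize takes (height, width), and crop takes (height, width, x, y). A standalone illustration of how such a string decomposes under that grammar (plain Python, not torchcodec code; it assumes specs are joined with ';' and carry no stray whitespace around the separator):

    # Mirrors the split()/makeTransforms() parsing shown above, for illustration only.
    transform_specs = "resize, 256, 256;crop, 224, 224, 16, 16"

    for spec in transform_specs.split(";"):
        name, *params = [field.strip() for field in spec.split(",")]
        if name == "resize":
            height, width = map(int, params)        # "resize, <height>, <width>"
            print(f"resize to {height}x{width}")
        elif name == "crop":
            height, width, x, y = map(int, params)  # "crop, <height>, <width>, <x>, <y>"
            print(f"crop {height}x{width} with top-left corner at ({x}, {y})")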
@@ -203,33 +311,47 @@ at::Tensor create_from_tensor(
  realSeek = seekModeFromString(seek_mode.value());
  }

- auto contextHolder = std::make_unique<AVIOFromTensorContext>(video_tensor);
+ auto avioContextHolder =
+ std::make_unique<AVIOFromTensorContext>(video_tensor);

  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
- std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
+ std::make_unique<SingleStreamDecoder>(
+ std::move(avioContextHolder), realSeek);
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
  }

- at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
- auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
- std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
+ at::Tensor _create_from_file_like(
+ int64_t file_like_context,
+ std::optional<std::string_view> seek_mode) {
+ auto fileLikeContext =
+ reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+ TORCH_CHECK(
+ fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+ std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+ SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
+ if (seek_mode.has_value()) {
+ realSeek = seekModeFromString(seek_mode.value());
+ }
+
+ std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
+ std::make_unique<SingleStreamDecoder>(
+ std::move(avioContextHolder), realSeek);
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
  }

  void _add_video_stream(
  at::Tensor& decoder,
- std::optional<int64_t> width = std::nullopt,
- std::optional<int64_t> height = std::nullopt,
  std::optional<int64_t> num_threads = std::nullopt,
  std::optional<std::string_view> dimension_order = std::nullopt,
  std::optional<int64_t> stream_index = std::nullopt,
- std::optional<std::string_view> device = std::nullopt,
+ std::string_view device = "cpu",
+ std::string_view device_variant = "ffmpeg",
+ std::string_view transform_specs = "",
  std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
  custom_frame_mappings = std::nullopt,
  std::optional<std::string_view> color_conversion_library = std::nullopt) {
  VideoStreamOptions videoStreamOptions;
- videoStreamOptions.width = width;
- videoStreamOptions.height = height;
  videoStreamOptions.ffmpegThreadCount = num_threads;

  if (dimension_order.has_value()) {
@@ -253,37 +375,46 @@ void _add_video_stream(
  ". color_conversion_library must be either filtergraph or swscale.");
  }
  }
- if (device.has_value()) {
- videoStreamOptions.device = createTorchDevice(std::string(device.value()));
- }
+
+ validateDeviceInterface(std::string(device), std::string(device_variant));
+
+ videoStreamOptions.device = torch::Device(std::string(device));
+ videoStreamOptions.deviceVariant = device_variant;
+
+ std::vector<Transform*> transforms =
+ makeTransforms(std::string(transform_specs));
+
  std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
  custom_frame_mappings.has_value()
  ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
  : std::nullopt;
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
  videoDecoder->addVideoStream(
- stream_index.value_or(-1), videoStreamOptions, converted_mappings);
+ stream_index.value_or(-1),
+ transforms,
+ videoStreamOptions,
+ converted_mappings);
  }

  // Add a new video stream at `stream_index` using the provided options.
  void add_video_stream(
  at::Tensor& decoder,
- std::optional<int64_t> width = std::nullopt,
- std::optional<int64_t> height = std::nullopt,
  std::optional<int64_t> num_threads = std::nullopt,
  std::optional<std::string_view> dimension_order = std::nullopt,
  std::optional<int64_t> stream_index = std::nullopt,
- std::optional<std::string_view> device = std::nullopt,
+ std::string_view device = "cpu",
+ std::string_view device_variant = "ffmpeg",
+ std::string_view transform_specs = "",
  const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
  custom_frame_mappings = std::nullopt) {
  _add_video_stream(
  decoder,
- width,
- height,
  num_threads,
  dimension_order,
  stream_index,
  device,
+ device_variant,
+ transform_specs,
  custom_frame_mappings);
  }

@@ -344,11 +475,9 @@ OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index) {
  // Return the frames at given indices for a given stream
  OpsFrameBatchOutput get_frames_at_indices(
  at::Tensor& decoder,
- at::IntArrayRef frame_indices) {
+ const at::Tensor& frame_indices) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
- std::vector<int64_t> frameIndicesVec(
- frame_indices.begin(), frame_indices.end());
- auto result = videoDecoder->getFramesAtIndices(frameIndicesVec);
+ auto result = videoDecoder->getFramesAtIndices(frame_indices);
  return makeOpsFrameBatchOutput(result);
  }

@@ -367,10 +496,9 @@ OpsFrameBatchOutput get_frames_in_range(
  // Return the frames at given ptss for a given stream
  OpsFrameBatchOutput get_frames_by_pts(
  at::Tensor& decoder,
- at::ArrayRef<double> timestamps) {
+ const at::Tensor& timestamps) {
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
- std::vector<double> timestampsVec(timestamps.begin(), timestamps.end());
- auto result = videoDecoder->getFramesPlayedAt(timestampsVec);
+ auto result = videoDecoder->getFramesPlayedAt(timestamps);
  return makeOpsFrameBatchOutput(result);
  }

@@ -441,6 +569,92 @@ at::Tensor encode_audio_to_tensor(
  .encodeToTensor();
  }

+ void _encode_audio_to_file_like(
+ const at::Tensor& samples,
+ int64_t sample_rate,
+ std::string_view format,
+ int64_t file_like_context,
+ std::optional<int64_t> bit_rate = std::nullopt,
+ std::optional<int64_t> num_channels = std::nullopt,
+ std::optional<int64_t> desired_sample_rate = std::nullopt) {
+ auto fileLikeContext =
+ reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+ TORCH_CHECK(
+ fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+ std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+ AudioStreamOptions audioStreamOptions;
+ audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
+ audioStreamOptions.numChannels =
+ validateOptionalInt64ToInt(num_channels, "num_channels");
+ audioStreamOptions.sampleRate =
+ validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
+
+ AudioEncoder encoder(
+ samples,
+ validateInt64ToInt(sample_rate, "sample_rate"),
+ format,
+ std::move(avioContextHolder),
+ audioStreamOptions);
+ encoder.encode();
+ }
+
+ void encode_video_to_file(
+ const at::Tensor& frames,
+ int64_t frame_rate,
+ std::string_view file_name,
+ std::optional<int64_t> crf = std::nullopt) {
+ VideoStreamOptions videoStreamOptions;
+ videoStreamOptions.crf = crf;
+ VideoEncoder(
+ frames,
+ validateInt64ToInt(frame_rate, "frame_rate"),
+ file_name,
+ videoStreamOptions)
+ .encode();
+ }
+
+ at::Tensor encode_video_to_tensor(
+ const at::Tensor& frames,
+ int64_t frame_rate,
+ std::string_view format,
+ std::optional<int64_t> crf = std::nullopt) {
+ auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
+ VideoStreamOptions videoStreamOptions;
+ videoStreamOptions.crf = crf;
+ return VideoEncoder(
+ frames,
+ validateInt64ToInt(frame_rate, "frame_rate"),
+ format,
+ std::move(avioContextHolder),
+ videoStreamOptions)
+ .encodeToTensor();
+ }
+
+ void _encode_video_to_file_like(
+ const at::Tensor& frames,
+ int64_t frame_rate,
+ std::string_view format,
+ int64_t file_like_context,
+ std::optional<int64_t> crf = std::nullopt) {
+ auto fileLikeContext =
+ reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+ TORCH_CHECK(
+ fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+ std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+ VideoStreamOptions videoStreamOptions;
+ videoStreamOptions.crf = crf;
+
+ VideoEncoder encoder(
+ frames,
+ validateInt64ToInt(frame_rate, "frame_rate"),
+ format,
+ std::move(avioContextHolder),
+ videoStreamOptions);
+ encoder.encode();
+ }
+
  // For testing only. We need to implement this operation as a core library
  // function because what we're testing is round-tripping pts values as
  // double-precision floating point numbers from C++ to Python and back to C++.
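The video encoder entry points mirror the audio ones: encode_video_to_file writes straight to a path, encode_video_to_tensor returns the encoded bytes as a tensor, and the _file_like variant adopts a pointer to an AVIOFileLikeContext created on the Python side. A hedged sketch of the two non-underscore ops, following the schemas registered earlier (the frame layout below is an assumption, not taken from this diff):

    import torch
    import torchcodec  # assumption: importing the package registers the torchcodec_ns ops

    frames = torch.zeros(30, 3, 240, 320, dtype=torch.uint8)  # assumed frames x C x H x W uint8

    # Write an encoded video file to disk; crf is optional per the schema.
    torch.ops.torchcodec_ns.encode_video_to_file(frames, 25, "out.mp4", crf=23)

    # Or keep the encoded bytes in memory as a tensor.
    encoded = torch.ops.torchcodec_ns.encode_video_to_tensor(frames, 25, "mp4", crf=23)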
@@ -682,6 +896,11 @@ std::string _get_json_ffmpeg_library_versions() {
  return ss.str();
  }

+ std::string get_backend_details(at::Tensor& decoder) {
+ auto videoDecoder = unwrapTensorToGetDecoder(decoder);
+ return videoDecoder->getDeviceInterfaceDetails();
+ }
+
  // Scans video packets to get more accurate metadata like frame count, exact
  // keyframe positions, etc. Exact keyframe positions are useful for efficient
  // accurate seeking. Note that this function reads the entire video but it does
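get_backend_details is exposed as the _get_backend_details op (see the schema above and the CPU registration below) and returns a string describing the device interface backing a decoder. A small hedged sketch, reusing a decoder created as in the earlier snippets:

    # Returns a description of the decoder's device interface / backend as a string.
    details = torch.ops.torchcodec_ns._get_backend_details(decoder)
    print(details)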
@@ -694,7 +913,7 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
  TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
  m.impl("create_from_file", &create_from_file);
  m.impl("create_from_tensor", &create_from_tensor);
- m.impl("_convert_to_tensor", &_convert_to_tensor);
+ m.impl("_create_from_file_like", &_create_from_file_like);
  m.impl(
  "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
  }
@@ -702,6 +921,10 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
  TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
  m.impl("encode_audio_to_file", &encode_audio_to_file);
  m.impl("encode_audio_to_tensor", &encode_audio_to_tensor);
+ m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like);
+ m.impl("encode_video_to_file", &encode_video_to_file);
+ m.impl("encode_video_to_tensor", &encode_video_to_tensor);
+ m.impl("_encode_video_to_file_like", &_encode_video_to_file_like);
  m.impl("seek_to_pts", &seek_to_pts);
  m.impl("add_video_stream", &add_video_stream);
  m.impl("_add_video_stream", &_add_video_stream);
@@ -722,6 +945,8 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
  m.impl(
  "scan_all_streams_to_update_metadata",
  &scan_all_streams_to_update_metadata);
+
+ m.impl("_get_backend_details", &get_backend_details);
  }

  } // namespace facebook::torchcodec
torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake

@@ -44,6 +44,10 @@ if (LINUX)
  f7_sha256
  1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
  )
+ set(
+ f8_sha256
+ c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
+ )
  set(
  f4_library_file_names
  libavutil.so.56
@@ -84,6 +88,16 @@ if (LINUX)
  libswscale.so.8
  libswresample.so.5
  )
+ set(
+ f8_library_file_names
+ libavutil.so.60
+ libavcodec.so.62
+ libavformat.so.62
+ libavdevice.so.62
+ libavfilter.so.11
+ libswscale.so.9
+ libswresample.so.6
+ )
  elseif (APPLE)
  set(lib_dir "lib")
  set(
@@ -106,6 +120,10 @@ elseif (APPLE)
  f7_sha256
  48a4fc8ce098305cfd4a58f40889249c523ca3c285f66ba704b5bad0e3ada53a
  )
+ set(
+ f8_sha256
+ beb936b76f25d2621228a12cdb67c9ae3d1eff7aa713ef8d1167ebf0c25bd5ec
+ )

  set(
  f4_library_file_names
@@ -147,6 +165,16 @@ elseif (APPLE)
  libswscale.8.dylib
  libswresample.5.dylib
  )
+ set(
+ f8_library_file_names
+ libavutil.60.dylib
+ libavcodec.62.dylib
+ libavformat.62.dylib
+ libavdevice.62.dylib
+ libavfilter.11.dylib
+ libswscale.9.dylib
+ libswresample.6.dylib
+ )

  elseif (WIN32)
  set(lib_dir "bin")
@@ -170,6 +198,10 @@ elseif (WIN32)
  f7_sha256
  ae391ace382330e912793b70b68529ee7c91026d2869b4df7e7c3e7d3656bdd5
  )
+ set(
+ f8_sha256
+ bac845ac79876b104959cb0e7b9dec772a261116344dd17d2f97e7ddfac4a73f
+ )

  set(
  f4_library_file_names
@@ -211,6 +243,16 @@ elseif (WIN32)
  swscale.lib
  swresample.lib
  )
+ set(
+ f8_library_file_names
+ avutil.lib
+ avcodec.lib
+ avformat.lib
+ avdevice.lib
+ avfilter.lib
+ swscale.lib
+ swresample.lib
+ )
  else()
  message(
  FATAL_ERROR
@@ -242,19 +284,27 @@ FetchContent_Declare(
  URL_HASH
  SHA256=${f7_sha256}
  )
+ FetchContent_Declare(
+ f8
+ URL ${platform_url}/8.0.tar.gz
+ URL_HASH
+ SHA256=${f8_sha256}
+ )

- FetchContent_MakeAvailable(f4 f5 f6 f7)
+ FetchContent_MakeAvailable(f4 f5 f6 f7 f8)

  add_library(ffmpeg4 INTERFACE)
  add_library(ffmpeg5 INTERFACE)
  add_library(ffmpeg6 INTERFACE)
  add_library(ffmpeg7 INTERFACE)
+ add_library(ffmpeg8 INTERFACE)

  # Note: the f?_SOURCE_DIR variables were set by FetchContent_MakeAvailable
  target_include_directories(ffmpeg4 INTERFACE ${f4_SOURCE_DIR}/include)
  target_include_directories(ffmpeg5 INTERFACE ${f5_SOURCE_DIR}/include)
  target_include_directories(ffmpeg6 INTERFACE ${f6_SOURCE_DIR}/include)
  target_include_directories(ffmpeg7 INTERFACE ${f7_SOURCE_DIR}/include)
+ target_include_directories(ffmpeg8 INTERFACE ${f8_SOURCE_DIR}/include)


  list(
@@ -277,6 +327,11 @@ list(
  PREPEND ${f7_SOURCE_DIR}/${lib_dir}/
  OUTPUT_VARIABLE f7_library_paths
  )
+ list(
+ TRANSFORM f8_library_file_names
+ PREPEND ${f8_SOURCE_DIR}/${lib_dir}/
+ OUTPUT_VARIABLE f8_library_paths
+ )

  target_link_libraries(
  ffmpeg4
@@ -298,3 +353,8 @@ target_link_libraries(
  INTERFACE
  ${f7_library_paths}
  )
+ target_link_libraries(
+ ffmpeg8
+ INTERFACE
+ ${f8_library_paths}
+ )
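The CMake changes fetch prebuilt FFmpeg 8 libraries and headers and define an ffmpeg8 interface target alongside the existing ffmpeg4 through ffmpeg7 targets, which is what the new libtorchcodec_*8 binaries in this wheel build against. At runtime, the _get_json_ffmpeg_library_versions op from the custom_ops diff above reports which FFmpeg the loaded core was built against; a hedged sketch (the exact JSON layout is not shown in this diff):

    import json
    import torch
    import torchcodec  # assumption: importing the package loads and registers the ops

    raw = torch.ops.torchcodec_ns._get_json_ffmpeg_library_versions()
    print(json.loads(raw))  # per-library FFmpeg version info reported by the loaded core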