torchcodec-0.7.0-cp312-cp312-win_amd64.whl → torchcodec-0.8.0-cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchcodec might be problematic.

Files changed (61)
  1. torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
  2. torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
  3. torchcodec/_core/CMakeLists.txt +36 -3
  4. torchcodec/_core/CUDACommon.cpp +315 -0
  5. torchcodec/_core/CUDACommon.h +46 -0
  6. torchcodec/_core/CpuDeviceInterface.cpp +189 -108
  7. torchcodec/_core/CpuDeviceInterface.h +81 -19
  8. torchcodec/_core/CudaDeviceInterface.cpp +211 -368
  9. torchcodec/_core/CudaDeviceInterface.h +33 -6
  10. torchcodec/_core/DeviceInterface.cpp +57 -19
  11. torchcodec/_core/DeviceInterface.h +97 -16
  12. torchcodec/_core/Encoder.cpp +302 -9
  13. torchcodec/_core/Encoder.h +51 -1
  14. torchcodec/_core/FFMPEGCommon.cpp +189 -2
  15. torchcodec/_core/FFMPEGCommon.h +18 -0
  16. torchcodec/_core/FilterGraph.cpp +28 -21
  17. torchcodec/_core/FilterGraph.h +15 -1
  18. torchcodec/_core/Frame.cpp +17 -7
  19. torchcodec/_core/Frame.h +15 -61
  20. torchcodec/_core/Metadata.h +2 -2
  21. torchcodec/_core/NVDECCache.cpp +70 -0
  22. torchcodec/_core/NVDECCache.h +104 -0
  23. torchcodec/_core/SingleStreamDecoder.cpp +202 -198
  24. torchcodec/_core/SingleStreamDecoder.h +39 -14
  25. torchcodec/_core/StreamOptions.h +16 -6
  26. torchcodec/_core/Transform.cpp +60 -0
  27. torchcodec/_core/Transform.h +59 -0
  28. torchcodec/_core/__init__.py +1 -0
  29. torchcodec/_core/custom_ops.cpp +180 -32
  30. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  31. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  32. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  33. torchcodec/_core/ops.py +86 -43
  34. torchcodec/_core/pybind_ops.cpp +22 -59
  35. torchcodec/_samplers/video_clip_sampler.py +7 -19
  36. torchcodec/decoders/__init__.py +1 -0
  37. torchcodec/decoders/_decoder_utils.py +61 -1
  38. torchcodec/decoders/_video_decoder.py +56 -20
  39. torchcodec/libtorchcodec_core4.dll +0 -0
  40. torchcodec/libtorchcodec_core5.dll +0 -0
  41. torchcodec/libtorchcodec_core6.dll +0 -0
  42. torchcodec/libtorchcodec_core7.dll +0 -0
  43. torchcodec/libtorchcodec_core8.dll +0 -0
  44. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  45. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  46. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  47. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  48. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  49. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  50. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  51. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  52. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  53. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  54. torchcodec/samplers/_time_based.py +8 -0
  55. torchcodec/version.py +1 -1
  56. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/METADATA +24 -13
  57. torchcodec-0.8.0.dist-info/RECORD +80 -0
  58. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/WHEEL +1 -1
  59. torchcodec-0.7.0.dist-info/RECORD +0 -67
  60. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/licenses/LICENSE +0 -0
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/top_level.txt +0 -0
torchcodec/_core/ops.py CHANGED
@@ -41,7 +41,7 @@ def load_torchcodec_shared_libraries():
     # libraries do not meet those conditions.

     exceptions = []
-    for ffmpeg_major_version in (7, 6, 5, 4):
+    for ffmpeg_major_version in (8, 7, 6, 5, 4):
         pybind_ops_module_name = _get_pybind_ops_module_name(ffmpeg_major_version)
         decoder_library_name = f"libtorchcodec_core{ffmpeg_major_version}"
         custom_ops_library_name = f"libtorchcodec_custom_ops{ffmpeg_major_version}"
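
The only change in this hunk is the version tuple: the loader now probes FFmpeg 8 first, then falls back through older majors, collecting one exception per failed version and only failing if every candidate fails. A minimal sketch of that try-in-order pattern, with `ctypes` standing in for the real loader (which loads the core, custom-ops, and pybind artifacts per FFmpeg major):

```python
import ctypes

# Sketch of the fallback pattern above; ctypes.CDLL is a stand-in for the
# actual library loading done by torchcodec.
def load_first_available(versions=(8, 7, 6, 5, 4)):
    exceptions = []
    for major in versions:
        try:
            return ctypes.CDLL(f"libtorchcodec_core{major}.dll")
        except OSError as exc:
            exceptions.append((major, exc))
    # Surface every per-version failure at once, like the real loader does.
    raise RuntimeError(f"Could not load libtorchcodec for any FFmpeg major: {exceptions}")
```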
@@ -92,14 +92,20 @@ create_from_file = torch._dynamo.disallow_in_graph(
 encode_audio_to_file = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.encode_audio_to_file.default
 )
+encode_video_to_file = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns.encode_video_to_file.default
+)
 encode_audio_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.encode_audio_to_tensor.default
 )
+_encode_audio_to_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._encode_audio_to_file_like.default
+)
 create_from_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_tensor.default
 )
-_convert_to_tensor = torch._dynamo.disallow_in_graph(
-    torch.ops.torchcodec_ns._convert_to_tensor.default
+_create_from_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._create_from_file_like.default
 )
 add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default
 _add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default
@@ -108,8 +114,10 @@ seek_to_pts = torch.ops.torchcodec_ns.seek_to_pts.default
 get_next_frame = torch.ops.torchcodec_ns.get_next_frame.default
 get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
 get_frame_at_index = torch.ops.torchcodec_ns.get_frame_at_index.default
-get_frames_at_indices = torch.ops.torchcodec_ns.get_frames_at_indices.default
-get_frames_by_pts = torch.ops.torchcodec_ns.get_frames_by_pts.default
+_get_frames_at_indices_tensor_input = (
+    torch.ops.torchcodec_ns.get_frames_at_indices.default
+)
+_get_frames_by_pts_tensor_input = torch.ops.torchcodec_ns.get_frames_by_pts.default
 get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
 get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
 get_frames_by_pts_in_range_audio = (
@@ -148,7 +156,12 @@ def create_from_file_like(
     file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None
 ) -> torch.Tensor:
     assert _pybind_ops is not None
-    return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
+    return _create_from_file_like(
+        _pybind_ops.create_file_like_context(
+            file_like, False  # False means not for writing
+        ),
+        seek_mode,
+    )


 def encode_audio_to_file_like(
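
The public entry point keeps its signature; only the plumbing changed. pybind11 now wraps the file-like object in an `AVIOFileLikeContext` and returns an opaque pointer-as-int, which the new `_create_from_file_like` custom op consumes. A usage sketch, assuming a local `video.mp4` exists:

```python
import io

from torchcodec._core import ops

# Any object exposing read(self, size) and seek(self, offset, whence) works.
with open("video.mp4", "rb") as f:
    buffer = io.BytesIO(f.read())

decoder = ops.create_from_file_like(buffer, seek_mode="approximate")
```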
@@ -176,35 +189,42 @@ def encode_audio_to_file_like(
     if samples.dtype != torch.float32:
         raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")

-    # We're having the same problem as with the decoder's create_from_file_like:
-    # We should be able to pass a tensor directly, but this leads to a pybind
-    # error. In order to work around this, we pass the pointer to the tensor's
-    # data, and its shape, in order to re-construct it in C++. For this to work:
-    # - the tensor must be float32
-    # - the tensor must be contiguous, which is why we call contiguous().
-    #   In theory we could avoid this restriction by also passing the strides?
-    # - IMPORTANT: the input samples tensor and its underlying data must be
-    #   alive during the call.
-    #
-    # A more elegant solution would be to cast the tensor into a py::object, but
-    # casting the py::object backk to a tensor in C++ seems to lead to the same
-    # pybing error.
-
-    samples = samples.contiguous()
-    _pybind_ops.encode_audio_to_file_like(
-        samples.data_ptr(),
-        list(samples.shape),
+    _encode_audio_to_file_like(
+        samples,
         sample_rate,
         format,
-        file_like,
+        _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
         bit_rate,
         num_channels,
         desired_sample_rate,
     )

-    # This check is useless but it's critical to keep it to ensures that samples
-    # is still alive during the call to encode_audio_to_file_like.
-    assert samples.is_contiguous()
+
+def get_frames_at_indices(
+    decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, list[int]]
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if isinstance(frame_indices, torch.Tensor):
+        # Ensure indices are the correct dtype (int64)
+        frame_indices = frame_indices.to(torch.int64)
+    else:
+        # Convert list to tensor for dispatch
+        frame_indices = torch.tensor(frame_indices)
+    return _get_frames_at_indices_tensor_input(decoder, frame_indices=frame_indices)
+
+
+def get_frames_by_pts(
+    decoder: torch.Tensor, *, timestamps: Union[torch.Tensor, list[float]]
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if isinstance(timestamps, torch.Tensor):
+        # Ensure timestamps are the correct dtype (float64)
+        timestamps = timestamps.to(torch.float64)
+    else:
+        # Convert list to tensor for dispatch
+        try:
+            timestamps = torch.tensor(timestamps, dtype=torch.float64)
+        except Exception as e:
+            raise ValueError("Couldn't convert timestamps input to a tensor") from e
+    return _get_frames_by_pts_tensor_input(decoder, timestamps=timestamps)


 # ==============================
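
`get_frames_at_indices` and `get_frames_by_pts` are now thin Python wrappers that normalize their input to a tensor of the right dtype before dispatching to the renamed tensor-input ops. Both call styles below end up on the same code path (a sketch; assumes a local `video.mp4`):

```python
import torch

from torchcodec._core import ops

decoder = ops.create_from_file("video.mp4")
ops.add_video_stream(decoder)

# A Python list is converted to a tensor internally...
frames, pts, durations = ops.get_frames_at_indices(decoder, frame_indices=[0, 10, 20])
# ...and a tensor is passed through directly (after a cast to int64).
frames, pts, durations = ops.get_frames_at_indices(
    decoder, frame_indices=torch.tensor([0, 10, 20])
)
```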
@@ -215,6 +235,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)


+@register_fake("torchcodec_ns::_create_from_file_like")
+def _create_from_file_like_abstract(
+    file_like: int, seek_mode: Optional[str]
+) -> torch.Tensor:
+    return torch.empty([], dtype=torch.long)
+
+
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
     samples: torch.Tensor,
@@ -227,6 +254,16 @@ def encode_audio_to_file_abstract(
     return


+@register_fake("torchcodec_ns::encode_video_to_file")
+def encode_video_to_file_abstract(
+    frames: torch.Tensor,
+    frame_rate: int,
+    filename: str,
+    crf: Optional[int] = None,
+) -> None:
+    return
+
+
 @register_fake("torchcodec_ns::encode_audio_to_tensor")
 def encode_audio_to_tensor_abstract(
     samples: torch.Tensor,
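
This fake registers the new video-encoding op for torch.compile tracing. Going only by the signature above, a call would look roughly like the sketch below; the expected frame layout and dtype are enforced by the C++ op, which is outside this diff, so the 4-D uint8 layout here is an assumption:

```python
import torch

from torchcodec._core import ops

# Assumed layout: a batch of frames as a uint8 tensor (num_frames, C, H, W).
frames = torch.zeros(30, 3, 240, 320, dtype=torch.uint8)
ops.encode_video_to_file(frames, 30, "out.mp4", crf=23)  # 30 fps; crf is optional
```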
@@ -239,6 +276,19 @@
     return torch.empty([], dtype=torch.long)


+@register_fake("torchcodec_ns::_encode_audio_to_file_like")
+def _encode_audio_to_file_like_abstract(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like_context: int,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+    desired_sample_rate: Optional[int] = None,
+) -> None:
+    return
+
+
 @register_fake("torchcodec_ns::create_from_tensor")
 def create_from_tensor_abstract(
     video_tensor: torch.Tensor, seek_mode: Optional[str]
@@ -246,21 +296,16 @@
     return torch.empty([], dtype=torch.long)


-@register_fake("torchcodec_ns::_convert_to_tensor")
-def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor:
-    return torch.empty([], dtype=torch.long)
-
-
 @register_fake("torchcodec_ns::_add_video_stream")
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -273,12 +318,12 @@
 def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -332,7 +377,7 @@
 def get_frames_by_pts_abstract(
     decoder: torch.Tensor,
     *,
-    timestamps: List[float],
+    timestamps: Union[torch.Tensor, List[float]],
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
@@ -356,9 +401,7 @@ def get_frame_at_index_abstract(

 @register_fake("torchcodec_ns::get_frames_at_indices")
 def get_frames_at_indices_abstract(
-    decoder: torch.Tensor,
-    *,
-    frame_indices: List[int],
+    decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, List[int]]
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
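
Across both fakes, the `width`/`height` arguments are gone and three new keyword arguments appear: `device` (now a plain string defaulting to "cpu"), `device_variant` (used by the CUDA backend selection further down), and `transform_specs`, a string-encoded transform pipeline. Resizing now goes through `transform_specs`, using the "resize, <height>, <width>" format visible in the sampler change below:

```python
from torchcodec._core import ops

decoder = ops.create_from_file("video.mp4")  # assumes this file exists

# Replaces the removed width=/height= arguments:
ops.add_video_stream(decoder, transform_specs="resize, 224, 224")
```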
torchcodec/_core/pybind_ops.cpp CHANGED
@@ -7,72 +7,36 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <cstdint>
-#include <string>

 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
-#include "src/torchcodec/_core/Encoder.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
-#include "src/torchcodec/_core/StreamOptions.h"
-#include "src/torchcodec/_core/ValidationUtils.h"

 namespace py = pybind11;

 namespace facebook::torchcodec {

-// In principle, this should be able to return a tensor. But when we try that,
-// we run into the bug reported here:
+// Note: It's not immediately obvious why we need both custom_ops.cpp and
+// pybind_ops.cpp. We do all other Python to C++ bridging in
+// custom_ops.cpp, and that even depends on pybind11, so why have an
+// explicit pybind-only file?
 //
-// https://github.com/pytorch/pytorch/issues/136664
+// The reason is that we want to accept OWNERSHIP of a file-like object
+// from the Python side. In order to do that, we need a proper
+// py::object. For raw bytes, we can launder that through a tensor on the
+// custom_ops.cpp side, but we can't launder a proper Python object
+// through a tensor. Custom ops can't accept a proper Python object
+// through py::object, so we have to do direct pybind11 here.
 //
-// So we instead launder the pointer through an int, and then use a conversion
-// function on the custom ops side to launder that int into a tensor.
-int64_t create_from_file_like(
-    py::object file_like,
-    std::optional<std::string_view> seek_mode) {
-  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
-  if (seek_mode.has_value()) {
-    realSeek = seekModeFromString(seek_mode.value());
-  }
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
-
-  SingleStreamDecoder* decoder =
-      new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
-  return reinterpret_cast<int64_t>(decoder);
-}
-
-void encode_audio_to_file_like(
-    int64_t data_ptr,
-    const std::vector<int64_t>& shape,
-    int64_t sample_rate,
-    std::string_view format,
-    py::object file_like,
-    std::optional<int64_t> bit_rate = std::nullopt,
-    std::optional<int64_t> num_channels = std::nullopt,
-    std::optional<int64_t> desired_sample_rate = std::nullopt) {
-  // We assume float32 *and* contiguity, this must be enforced by the caller.
-  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
-  auto samples = torch::from_blob(
-      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
-
-  AudioStreamOptions audioStreamOptions;
-  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
-  audioStreamOptions.numChannels =
-      validateOptionalInt64ToInt(num_channels, "num_channels");
-  audioStreamOptions.sampleRate =
-      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
-
-  AudioEncoder encoder(
-      samples,
-      validateInt64ToInt(sample_rate, "sample_rate"),
-      format,
-      std::move(avioContextHolder),
-      audioStreamOptions);
-  encoder.encode();
+// TODO: Investigate if we can do something better here. See:
+// https://github.com/pytorch/torchcodec/issues/896
+// Short version is that we're laundering a pointer through an int, the
+// Python side forwards that to decoder creation functions in
+// custom_ops.cpp and we do another cast on that side to get a pointer
+// again. We want to investigate if we can do something cleaner by
+// defining proper pybind objects.
+int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
+  AVIOFileLikeContext* context =
+      new AVIOFileLikeContext(file_like, is_for_writing);
+  return reinterpret_cast<int64_t>(context);
 }

 #ifndef PYBIND_OPS_MODULE_NAME
@@ -80,8 +44,7 @@ void encode_audio_to_file_like(
 #endif

 PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
-  m.def("create_from_file_like", &create_from_file_like);
-  m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
+  m.def("create_file_like_context", &create_file_like_context);
 }

 } // namespace facebook::torchcodec
torchcodec/_samplers/video_clip_sampler.py CHANGED
@@ -105,25 +105,12 @@ class IndexBasedSamplerArgs(SamplerArgs):
     sample_step: int = 1


-class VideoClipSampler(nn.Module):
+class DEPRECATED_VideoClipSampler(nn.Module):
     """
-    VideoClipSampler will do video clip sampling with given video args and sampler args.
-    The video args contains video related information, frames_per_clip, dimensions etc.
-    The sampler args can be either time-based or index-based, it will be used to decide clip start time pts or index.
-    ClipSampling support, random, uniform, periodic, target, keyframe sampling etc.
+    DEPRECATED: Do not use. The supported samplers are in `torchcodec.samplers`. See:

-    Args:
-        video_args (`VideoArgs`): The video args
-        sampler_args (`SamplerArgs`): The sampler args. Can be TimeBasedSamplerArgs or IndexBasedSamplerArgs
-        decoder_args (`DecoderArgs`): Decoder args contain value needs for decoder, for example, thread count
-
-    Example:
-        >>> video_args = VideoArgs(desired_width=224, desired_height=224)
-        >>> time_based_sampler_args = TimeBasedSamplerArgs(sampler_type="random", clips_per_video=1, frames_per_clip=4)
-        >>> video_decoder_args = DecoderArgs(num_threads=1)
-        >>> video_clip_sampler = VideoClipSampler(video_args, time_based_sampler_args, decoder_args)
-        >>> clips = video_clip_sampler(video_data)
-        clips now contains a list of clip, where clip is a list of frame tensors, each tensor represents a frame image.
+    * https://docs.pytorch.org/torchcodec/stable/api_ref_torchcodec.html
+    * https://docs.pytorch.org/torchcodec/stable/generated_examples/decoding/sampling.html
     """

     def __init__(
@@ -160,8 +147,7 @@ class VideoClipSampler(nn.Module):
         scan_all_streams_to_update_metadata(video_decoder)
         add_video_stream(
             video_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             num_threads=self.decoder_args.num_threads,
         )
@@ -240,6 +226,8 @@
                 clip_start_idx + i * index_based_sampler_args.video_frame_dilation
                 for i in range(index_based_sampler_args.frames_per_clip)
             ]
+            # Need torch.stack to convert List[Tensor[int]] into 1D Tensor[int]
+            batch_indexes = torch.stack(batch_indexes)
             frames, *_ = get_frames_at_indices(
                 video_decoder,
                 frame_indices=batch_indexes,
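
`clip_start_idx` comes out of tensor arithmetic, so `batch_indexes` is a list of 0-dim tensors rather than ints; the added `torch.stack` call is what turns it into the 1-D index tensor the tensor-input op expects:

```python
import torch

clip_start_idx = torch.tensor(5)  # 0-dim tensor, as produced by the sampler
batch_indexes = [clip_start_idx + i * 2 for i in range(4)]  # list of 0-dim tensors
print(torch.stack(batch_indexes))  # tensor([ 5,  7,  9, 11])
```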
torchcodec/decoders/__init__.py CHANGED
@@ -6,6 +6,7 @@

 from .._core import AudioStreamMetadata, VideoStreamMetadata
 from ._audio_decoder import AudioDecoder  # noqa
+from ._decoder_utils import set_cuda_backend  # noqa
 from ._video_decoder import VideoDecoder  # noqa

 SimpleVideoDecoder = VideoDecoder
torchcodec/decoders/_decoder_utils.py CHANGED
@@ -4,10 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import contextvars
 import io
+from contextlib import contextmanager
 from pathlib import Path

-from typing import Union
+from typing import Generator, Union

 from torch import Tensor
 from torchcodec import _core as core
@@ -50,3 +52,61 @@ def create_decoder(
         "read(self, size: int) -> bytes and "
         "seek(self, offset: int, whence: int) -> int methods."
     )
+
+
+# Thread-local and async-safe storage for the current CUDA backend
+_CUDA_BACKEND: contextvars.ContextVar[str] = contextvars.ContextVar(
+    "_CUDA_BACKEND", default="ffmpeg"
+)
+
+
+@contextmanager
+def set_cuda_backend(backend: str) -> Generator[None, None, None]:
+    """Context manager to set the CUDA backend for :class:`~torchcodec.decoders.VideoDecoder`.
+
+    This context manager allows you to specify which CUDA backend implementation
+    to use when creating :class:`~torchcodec.decoders.VideoDecoder` instances
+    with CUDA devices.
+
+    .. note::
+        **We recommend trying the "beta" backend instead of the default "ffmpeg"
+        backend!** The beta backend is faster, and will eventually become the
+        default in future versions. It may have rough edges that we'll polish
+        over time, but it's already quite stable and ready for adoption. Let us
+        know what you think!
+
+    Only the creation of the decoder needs to be inside the context manager; the
+    decoding methods can be called outside of it. You still need to pass
+    ``device="cuda"`` when creating the
+    :class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
+    specified, this context manager will have no effect. See example below.
+
+    This is thread-safe and async-safe.
+
+    Args:
+        backend (str): The CUDA backend to use. Can be "ffmpeg" (default) or
+            "beta". We recommend trying "beta" as it's faster!
+
+    Example:
+        >>> with set_cuda_backend("beta"):
+        ...     decoder = VideoDecoder("video.mp4", device="cuda")
+        ...
+        ... # Only the decoder creation needs to be part of the context manager.
+        ... # The decoder will now use the beta CUDA implementation:
+        ... decoder.get_frame_at(0)
+    """
+    backend = backend.lower()
+    if backend not in ("ffmpeg", "beta"):
+        raise ValueError(
+            f"Invalid CUDA backend ({backend}). Supported values are 'ffmpeg' and 'beta'."
+        )
+
+    previous_state = _CUDA_BACKEND.set(backend)
+    try:
+        yield
+    finally:
+        _CUDA_BACKEND.reset(previous_state)
+
+
+def _get_cuda_backend() -> str:
+    return _CUDA_BACKEND.get()
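
Storing the backend in a `ContextVar` rather than a module-level global is what makes `set_cuda_backend` thread-safe and async-safe: each thread (and each asyncio task) sees its own value. A minimal demonstration of that isolation, independent of torchcodec:

```python
import contextvars
import threading

backend = contextvars.ContextVar("backend", default="ffmpeg")

def worker(results):
    backend.set("beta")  # only affects this thread's context
    results.append(backend.get())

results = []
thread = threading.Thread(target=worker, args=(results,))
thread.start()
thread.join()
print(results[0], backend.get())  # "beta" in the worker, "ffmpeg" in the main thread
```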
torchcodec/decoders/_video_decoder.py CHANGED
@@ -15,6 +15,7 @@ from torch import device as torch_device, Tensor

 from torchcodec import _core as core, Frame, FrameBatch
 from torchcodec.decoders._decoder_utils import (
+    _get_cuda_backend,
     create_decoder,
     ERROR_REPORTING_INSTRUCTIONS,
 )
@@ -55,6 +56,8 @@
             Passing 0 lets FFmpeg decide on the number of threads.
             Default: 1.
         device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
+            If you pass a CUDA device, we recommend trying the "beta" CUDA
+            backend, which is faster! See :func:`~torchcodec.decoders.set_cuda_backend`.
         seek_mode (str, optional): Determines if frame access will be "exact" or
             "approximate". Exact guarantees that requesting frame i will always
             return frame i, but doing so requires an initial :term:`scan` of the
@@ -63,6 +66,27 @@
             probably is. Default: "exact".
             Read more about this parameter in:
             :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
+        custom_frame_mappings (str, bytes, or file-like object, optional):
+            Mapping of frames to their metadata, typically generated via ffprobe.
+            This enables accurate frame seeking without requiring a full video scan.
+            Do not set seek_mode when custom_frame_mappings is provided.
+            Expected JSON format:
+
+            .. code-block:: json
+
+                {
+                    "frames": [
+                        {
+                            "pts": 0,
+                            "duration": 1001,
+                            "key_frame": 1
+                        }
+                    ]
+                }
+
+            Alternative field names "pkt_pts" and "pkt_duration" are also supported.
+            Read more about this parameter in:
+            :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`

     Attributes:
         metadata (VideoStreamMetadata): Metadata of the video stream.
@@ -80,6 +104,9 @@
         num_ffmpeg_threads: int = 1,
         device: Optional[Union[str, torch_device]] = "cpu",
         seek_mode: Literal["exact", "approximate"] = "exact",
+        custom_frame_mappings: Optional[
+            Union[str, bytes, io.RawIOBase, io.BufferedReader]
+        ] = None,
     ):
         torch._C._log_api_usage_once("torchcodec.decoders.VideoDecoder")
         allowed_seek_modes = ("exact", "approximate")
@@ -89,7 +116,6 @@
                 f"Supported values are {', '.join(allowed_seek_modes)}."
             )

-        custom_frame_mappings = None
         # Validate seek_mode and custom_frame_mappings are not mismatched
         if custom_frame_mappings is not None and seek_mode == "approximate":
             raise ValueError(
@@ -120,12 +146,25 @@
         if isinstance(device, torch_device):
             device = str(device)

+        device_variant = _get_cuda_backend()
+        if device_variant == "ffmpeg":
+            # TODONVDEC P2 rename 'default' into 'ffmpeg' everywhere.
+            device_variant = "default"
+
+        # Legacy support for device="cuda:0:beta" syntax.
+        # TODONVDEC P2: remove support for this everywhere. This will require
+        # updating our tests.
+        if device == "cuda:0:beta":
+            device = "cuda:0"
+            device_variant = "beta"
+
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
             device=device,
+            device_variant=device_variant,
             custom_frame_mappings=custom_frame_mappings_data,
         )
@@ -217,24 +256,20 @@
             duration_seconds=duration_seconds.item(),
         )

-    def get_frames_at(self, indices: list[int]) -> FrameBatch:
+    def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch:
         """Return frames at the given indices.

         Args:
-            indices (list of int): The indices of the frames to retrieve.
+            indices (torch.Tensor or list of int): The indices of the frames to retrieve.

         Returns:
             FrameBatch: The frames at the given indices.
         """
-        if isinstance(indices, torch.Tensor):
-            # TODO we should avoid converting tensors to lists and just let the
-            # core ops and C++ code natively accept tensors. See
-            # https://github.com/pytorch/torchcodec/issues/879
-            indices = indices.to(torch.int).tolist()

         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
             self._decoder, frame_indices=indices
         )
+
         return FrameBatch(
             data=data,
             pts_seconds=pts_seconds,
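
With the tensor round-trip gone, index tensors flow straight through to the core op, so callers can pass the output of tensor ops without a `.tolist()` conversion (a sketch; assumes `video.mp4` exists and has at least 100 frames):

```python
import torch

from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")
batch = decoder.get_frames_at(torch.arange(0, 100, 10))  # every 10th frame
```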
@@ -298,20 +333,17 @@
             duration_seconds=duration_seconds.item(),
         )

-    def get_frames_played_at(self, seconds: list[float]) -> FrameBatch:
+    def get_frames_played_at(
+        self, seconds: Union[torch.Tensor, list[float]]
+    ) -> FrameBatch:
         """Return frames played at the given timestamps in seconds.

         Args:
-            seconds (list of float): The timestamps in seconds when the frames are played.
+            seconds (torch.Tensor or list of float): The timestamps in seconds when the frames are played.

         Returns:
             FrameBatch: The frames that are played at ``seconds``.
         """
-        if isinstance(seconds, torch.Tensor):
-            # TODO we should avoid converting tensors to lists and just let the
-            # core ops and C++ code natively accept tensors. See
-            # https://github.com/pytorch/torchcodec/issues/879
-            seconds = seconds.to(torch.float).tolist()

         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds
@@ -454,11 +486,15 @@ def _read_custom_frame_mappings(
             "Invalid custom frame mappings. The 'pts'/'pkt_pts', 'duration'/'pkt_duration', and 'key_frame' keys are required in the frame metadata."
         )

-    frame_data = [
-        (float(frame[pts_key]), frame["key_frame"], float(frame[duration_key]))
-        for frame in input_data["frames"]
-    ]
-    all_frames, is_key_frame, duration = map(torch.tensor, zip(*frame_data))
+    all_frames = torch.tensor(
+        [int(frame[pts_key]) for frame in input_data["frames"]], dtype=torch.int64
+    )
+    is_key_frame = torch.tensor(
+        [int(frame["key_frame"]) for frame in input_data["frames"]], dtype=torch.bool
+    )
+    duration = torch.tensor(
+        [int(frame[duration_key]) for frame in input_data["frames"]], dtype=torch.int64
+    )
     if not (len(all_frames) == len(is_key_frame) == len(duration)):
         raise ValueError("Mismatched lengths in frame index data")
     return all_frames, is_key_frame, duration
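
The docstring above says the mappings are typically generated via ffprobe, and the stricter int64/bool parsing here matches ffprobe's JSON output. A sketch of producing and consuming a mapping (standard ffprobe flags; older ffprobe versions emit `pkt_pts`/`pkt_duration`, which the parser also accepts):

```python
import subprocess

from torchcodec.decoders import VideoDecoder

# Dump per-frame pts/duration/key_frame for the first video stream as JSON.
mappings = subprocess.run(
    [
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_frames", "-show_entries", "frame=pts,duration,key_frame",
        "-of", "json", "video.mp4",
    ],
    capture_output=True, text=True, check=True,
).stdout

decoder = VideoDecoder("video.mp4", custom_frame_mappings=mappings)
```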