torchcodec 0.7.0__cp313-cp313-win_amd64.whl → 0.8.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec might be problematic. Click here for more details.
- torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
- torchcodec/_core/CMakeLists.txt +36 -3
- torchcodec/_core/CUDACommon.cpp +315 -0
- torchcodec/_core/CUDACommon.h +46 -0
- torchcodec/_core/CpuDeviceInterface.cpp +189 -108
- torchcodec/_core/CpuDeviceInterface.h +81 -19
- torchcodec/_core/CudaDeviceInterface.cpp +211 -368
- torchcodec/_core/CudaDeviceInterface.h +33 -6
- torchcodec/_core/DeviceInterface.cpp +57 -19
- torchcodec/_core/DeviceInterface.h +97 -16
- torchcodec/_core/Encoder.cpp +302 -9
- torchcodec/_core/Encoder.h +51 -1
- torchcodec/_core/FFMPEGCommon.cpp +189 -2
- torchcodec/_core/FFMPEGCommon.h +18 -0
- torchcodec/_core/FilterGraph.cpp +28 -21
- torchcodec/_core/FilterGraph.h +15 -1
- torchcodec/_core/Frame.cpp +17 -7
- torchcodec/_core/Frame.h +15 -61
- torchcodec/_core/Metadata.h +2 -2
- torchcodec/_core/NVDECCache.cpp +70 -0
- torchcodec/_core/NVDECCache.h +104 -0
- torchcodec/_core/SingleStreamDecoder.cpp +202 -198
- torchcodec/_core/SingleStreamDecoder.h +39 -14
- torchcodec/_core/StreamOptions.h +16 -6
- torchcodec/_core/Transform.cpp +60 -0
- torchcodec/_core/Transform.h +59 -0
- torchcodec/_core/__init__.py +1 -0
- torchcodec/_core/custom_ops.cpp +180 -32
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +86 -43
- torchcodec/_core/pybind_ops.cpp +22 -59
- torchcodec/_samplers/video_clip_sampler.py +7 -19
- torchcodec/decoders/__init__.py +1 -0
- torchcodec/decoders/_decoder_utils.py +61 -1
- torchcodec/decoders/_video_decoder.py +56 -20
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/samplers/_time_based.py +8 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/METADATA +24 -13
- torchcodec-0.8.0.dist-info/RECORD +80 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/WHEEL +1 -1
- torchcodec-0.7.0.dist-info/RECORD +0 -67
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/top_level.txt +0 -0
torchcodec/_core/ops.py
CHANGED
|
@@ -41,7 +41,7 @@ def load_torchcodec_shared_libraries():
|
|
|
41
41
|
# libraries do not meet those conditions.
|
|
42
42
|
|
|
43
43
|
exceptions = []
|
|
44
|
-
for ffmpeg_major_version in (7, 6, 5, 4):
|
|
44
|
+
for ffmpeg_major_version in (8, 7, 6, 5, 4):
|
|
45
45
|
pybind_ops_module_name = _get_pybind_ops_module_name(ffmpeg_major_version)
|
|
46
46
|
decoder_library_name = f"libtorchcodec_core{ffmpeg_major_version}"
|
|
47
47
|
custom_ops_library_name = f"libtorchcodec_custom_ops{ffmpeg_major_version}"
|
|
@@ -92,14 +92,20 @@ create_from_file = torch._dynamo.disallow_in_graph(
|
|
|
92
92
|
encode_audio_to_file = torch._dynamo.disallow_in_graph(
|
|
93
93
|
torch.ops.torchcodec_ns.encode_audio_to_file.default
|
|
94
94
|
)
|
|
95
|
+
encode_video_to_file = torch._dynamo.disallow_in_graph(
|
|
96
|
+
torch.ops.torchcodec_ns.encode_video_to_file.default
|
|
97
|
+
)
|
|
95
98
|
encode_audio_to_tensor = torch._dynamo.disallow_in_graph(
|
|
96
99
|
torch.ops.torchcodec_ns.encode_audio_to_tensor.default
|
|
97
100
|
)
|
|
101
|
+
_encode_audio_to_file_like = torch._dynamo.disallow_in_graph(
|
|
102
|
+
torch.ops.torchcodec_ns._encode_audio_to_file_like.default
|
|
103
|
+
)
|
|
98
104
|
create_from_tensor = torch._dynamo.disallow_in_graph(
|
|
99
105
|
torch.ops.torchcodec_ns.create_from_tensor.default
|
|
100
106
|
)
|
|
101
|
-
|
|
102
|
-
torch.ops.torchcodec_ns.
|
|
107
|
+
_create_from_file_like = torch._dynamo.disallow_in_graph(
|
|
108
|
+
torch.ops.torchcodec_ns._create_from_file_like.default
|
|
103
109
|
)
|
|
104
110
|
add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default
|
|
105
111
|
_add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default
|
|
@@ -108,8 +114,10 @@ seek_to_pts = torch.ops.torchcodec_ns.seek_to_pts.default
|
|
|
108
114
|
get_next_frame = torch.ops.torchcodec_ns.get_next_frame.default
|
|
109
115
|
get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
|
|
110
116
|
get_frame_at_index = torch.ops.torchcodec_ns.get_frame_at_index.default
|
|
111
|
-
|
|
112
|
-
|
|
117
|
+
_get_frames_at_indices_tensor_input = (
|
|
118
|
+
torch.ops.torchcodec_ns.get_frames_at_indices.default
|
|
119
|
+
)
|
|
120
|
+
_get_frames_by_pts_tensor_input = torch.ops.torchcodec_ns.get_frames_by_pts.default
|
|
113
121
|
get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
|
|
114
122
|
get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
|
|
115
123
|
get_frames_by_pts_in_range_audio = (
|
|
@@ -148,7 +156,12 @@ def create_from_file_like(
|
|
|
148
156
|
file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None
|
|
149
157
|
) -> torch.Tensor:
|
|
150
158
|
assert _pybind_ops is not None
|
|
151
|
-
return
|
|
159
|
+
return _create_from_file_like(
|
|
160
|
+
_pybind_ops.create_file_like_context(
|
|
161
|
+
file_like, False # False means not for writing
|
|
162
|
+
),
|
|
163
|
+
seek_mode,
|
|
164
|
+
)
|
|
152
165
|
|
|
153
166
|
|
|
154
167
|
def encode_audio_to_file_like(
|
|
@@ -176,35 +189,42 @@ def encode_audio_to_file_like(
|
|
|
176
189
|
if samples.dtype != torch.float32:
|
|
177
190
|
raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
|
|
178
191
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
# error. In order to work around this, we pass the pointer to the tensor's
|
|
182
|
-
# data, and its shape, in order to re-construct it in C++. For this to work:
|
|
183
|
-
# - the tensor must be float32
|
|
184
|
-
# - the tensor must be contiguous, which is why we call contiguous().
|
|
185
|
-
# In theory we could avoid this restriction by also passing the strides?
|
|
186
|
-
# - IMPORTANT: the input samples tensor and its underlying data must be
|
|
187
|
-
# alive during the call.
|
|
188
|
-
#
|
|
189
|
-
# A more elegant solution would be to cast the tensor into a py::object, but
|
|
190
|
-
# casting the py::object backk to a tensor in C++ seems to lead to the same
|
|
191
|
-
# pybing error.
|
|
192
|
-
|
|
193
|
-
samples = samples.contiguous()
|
|
194
|
-
_pybind_ops.encode_audio_to_file_like(
|
|
195
|
-
samples.data_ptr(),
|
|
196
|
-
list(samples.shape),
|
|
192
|
+
_encode_audio_to_file_like(
|
|
193
|
+
samples,
|
|
197
194
|
sample_rate,
|
|
198
195
|
format,
|
|
199
|
-
file_like,
|
|
196
|
+
_pybind_ops.create_file_like_context(file_like, True), # True means for writing
|
|
200
197
|
bit_rate,
|
|
201
198
|
num_channels,
|
|
202
199
|
desired_sample_rate,
|
|
203
200
|
)
|
|
204
201
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
202
|
+
|
|
203
|
+
def get_frames_at_indices(
|
|
204
|
+
decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, list[int]]
|
|
205
|
+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
206
|
+
if isinstance(frame_indices, torch.Tensor):
|
|
207
|
+
# Ensure indices is the correct dtype (int64)
|
|
208
|
+
frame_indices = frame_indices.to(torch.int64)
|
|
209
|
+
else:
|
|
210
|
+
# Convert list to tensor for dispatch
|
|
211
|
+
frame_indices = torch.tensor(frame_indices)
|
|
212
|
+
return _get_frames_at_indices_tensor_input(decoder, frame_indices=frame_indices)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def get_frames_by_pts(
|
|
216
|
+
decoder: torch.Tensor, *, timestamps: Union[torch.Tensor, list[float]]
|
|
217
|
+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
218
|
+
if isinstance(timestamps, torch.Tensor):
|
|
219
|
+
# Ensure timestamps is the correct dtype (float64)
|
|
220
|
+
timestamps = timestamps.to(torch.float64)
|
|
221
|
+
else:
|
|
222
|
+
# Convert list to tensor for dispatch
|
|
223
|
+
try:
|
|
224
|
+
timestamps = torch.tensor(timestamps, dtype=torch.float64)
|
|
225
|
+
except Exception as e:
|
|
226
|
+
raise ValueError("Couldn't convert timestamps input to a tensor") from e
|
|
227
|
+
return _get_frames_by_pts_tensor_input(decoder, timestamps=timestamps)
|
|
208
228
|
|
|
209
229
|
|
|
210
230
|
# ==============================
|
|
@@ -215,6 +235,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
|
|
|
215
235
|
return torch.empty([], dtype=torch.long)
|
|
216
236
|
|
|
217
237
|
|
|
238
|
+
@register_fake("torchcodec_ns::_create_from_file_like")
|
|
239
|
+
def _create_from_file_like_abstract(
|
|
240
|
+
file_like: int, seek_mode: Optional[str]
|
|
241
|
+
) -> torch.Tensor:
|
|
242
|
+
return torch.empty([], dtype=torch.long)
|
|
243
|
+
|
|
244
|
+
|
|
218
245
|
@register_fake("torchcodec_ns::encode_audio_to_file")
|
|
219
246
|
def encode_audio_to_file_abstract(
|
|
220
247
|
samples: torch.Tensor,
|
|
@@ -227,6 +254,16 @@ def encode_audio_to_file_abstract(
|
|
|
227
254
|
return
|
|
228
255
|
|
|
229
256
|
|
|
257
|
+
@register_fake("torchcodec_ns::encode_video_to_file")
|
|
258
|
+
def encode_video_to_file_abstract(
|
|
259
|
+
frames: torch.Tensor,
|
|
260
|
+
frame_rate: int,
|
|
261
|
+
filename: str,
|
|
262
|
+
crf: Optional[int] = None,
|
|
263
|
+
) -> None:
|
|
264
|
+
return
|
|
265
|
+
|
|
266
|
+
|
|
230
267
|
@register_fake("torchcodec_ns::encode_audio_to_tensor")
|
|
231
268
|
def encode_audio_to_tensor_abstract(
|
|
232
269
|
samples: torch.Tensor,
|
|
@@ -239,6 +276,19 @@ def encode_audio_to_tensor_abstract(
|
|
|
239
276
|
return torch.empty([], dtype=torch.long)
|
|
240
277
|
|
|
241
278
|
|
|
279
|
+
@register_fake("torchcodec_ns::_encode_audio_to_file_like")
|
|
280
|
+
def _encode_audio_to_file_like_abstract(
|
|
281
|
+
samples: torch.Tensor,
|
|
282
|
+
sample_rate: int,
|
|
283
|
+
format: str,
|
|
284
|
+
file_like_context: int,
|
|
285
|
+
bit_rate: Optional[int] = None,
|
|
286
|
+
num_channels: Optional[int] = None,
|
|
287
|
+
desired_sample_rate: Optional[int] = None,
|
|
288
|
+
) -> None:
|
|
289
|
+
return
|
|
290
|
+
|
|
291
|
+
|
|
242
292
|
@register_fake("torchcodec_ns::create_from_tensor")
|
|
243
293
|
def create_from_tensor_abstract(
|
|
244
294
|
video_tensor: torch.Tensor, seek_mode: Optional[str]
|
|
@@ -246,21 +296,16 @@ def create_from_tensor_abstract(
|
|
|
246
296
|
return torch.empty([], dtype=torch.long)
|
|
247
297
|
|
|
248
298
|
|
|
249
|
-
@register_fake("torchcodec_ns::_convert_to_tensor")
|
|
250
|
-
def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor:
|
|
251
|
-
return torch.empty([], dtype=torch.long)
|
|
252
|
-
|
|
253
|
-
|
|
254
299
|
@register_fake("torchcodec_ns::_add_video_stream")
|
|
255
300
|
def _add_video_stream_abstract(
|
|
256
301
|
decoder: torch.Tensor,
|
|
257
302
|
*,
|
|
258
|
-
width: Optional[int] = None,
|
|
259
|
-
height: Optional[int] = None,
|
|
260
303
|
num_threads: Optional[int] = None,
|
|
261
304
|
dimension_order: Optional[str] = None,
|
|
262
305
|
stream_index: Optional[int] = None,
|
|
263
|
-
device:
|
|
306
|
+
device: str = "cpu",
|
|
307
|
+
device_variant: str = "default",
|
|
308
|
+
transform_specs: str = "",
|
|
264
309
|
custom_frame_mappings: Optional[
|
|
265
310
|
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
|
|
266
311
|
] = None,
|
|
@@ -273,12 +318,12 @@ def _add_video_stream_abstract(
|
|
|
273
318
|
def add_video_stream_abstract(
|
|
274
319
|
decoder: torch.Tensor,
|
|
275
320
|
*,
|
|
276
|
-
width: Optional[int] = None,
|
|
277
|
-
height: Optional[int] = None,
|
|
278
321
|
num_threads: Optional[int] = None,
|
|
279
322
|
dimension_order: Optional[str] = None,
|
|
280
323
|
stream_index: Optional[int] = None,
|
|
281
|
-
device:
|
|
324
|
+
device: str = "cpu",
|
|
325
|
+
device_variant: str = "default",
|
|
326
|
+
transform_specs: str = "",
|
|
282
327
|
custom_frame_mappings: Optional[
|
|
283
328
|
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
|
|
284
329
|
] = None,
|
|
@@ -332,7 +377,7 @@ def get_frame_at_pts_abstract(
|
|
|
332
377
|
def get_frames_by_pts_abstract(
|
|
333
378
|
decoder: torch.Tensor,
|
|
334
379
|
*,
|
|
335
|
-
timestamps: List[float],
|
|
380
|
+
timestamps: Union[torch.Tensor, List[float]],
|
|
336
381
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
337
382
|
image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
|
|
338
383
|
return (
|
|
@@ -356,9 +401,7 @@ def get_frame_at_index_abstract(
|
|
|
356
401
|
|
|
357
402
|
@register_fake("torchcodec_ns::get_frames_at_indices")
|
|
358
403
|
def get_frames_at_indices_abstract(
|
|
359
|
-
decoder: torch.Tensor,
|
|
360
|
-
*,
|
|
361
|
-
frame_indices: List[int],
|
|
404
|
+
decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, List[int]]
|
|
362
405
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
363
406
|
image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
|
|
364
407
|
return (
|
torchcodec/_core/pybind_ops.cpp
CHANGED
|
@@ -7,72 +7,36 @@
|
|
|
7
7
|
#include <pybind11/pybind11.h>
|
|
8
8
|
#include <pybind11/stl.h>
|
|
9
9
|
#include <cstdint>
|
|
10
|
-
#include <string>
|
|
11
10
|
|
|
12
11
|
#include "src/torchcodec/_core/AVIOFileLikeContext.h"
|
|
13
|
-
#include "src/torchcodec/_core/Encoder.h"
|
|
14
|
-
#include "src/torchcodec/_core/SingleStreamDecoder.h"
|
|
15
|
-
#include "src/torchcodec/_core/StreamOptions.h"
|
|
16
|
-
#include "src/torchcodec/_core/ValidationUtils.h"
|
|
17
12
|
|
|
18
13
|
namespace py = pybind11;
|
|
19
14
|
|
|
20
15
|
namespace facebook::torchcodec {
|
|
21
16
|
|
|
22
|
-
//
|
|
23
|
-
//
|
|
17
|
+
// Note: It's not immediately obvious why we need both custom_ops.cpp and
|
|
18
|
+
// pybind_ops.cpp. We do all other Python to C++ bridging in
|
|
19
|
+
// custom_ops.cpp, and that even depends on pybind11, so why have an
|
|
20
|
+
// explicit pybind-only file?
|
|
24
21
|
//
|
|
25
|
-
//
|
|
22
|
+
// The reason is that we want to accept OWNERSHIP of a file-like object
|
|
23
|
+
// from the Python side. In order to do that, we need a proper
|
|
24
|
+
// py::object. For raw bytes, we can launder that through a tensor on the
|
|
25
|
+
// custom_ops.cpp side, but we can't launder a proper Python object
|
|
26
|
+
// through a tensor. Custom ops can't accept a proper Python object
|
|
27
|
+
// through py::object, so we have to do direct pybind11 here.
|
|
26
28
|
//
|
|
27
|
-
//
|
|
28
|
-
//
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
|
|
39
|
-
|
|
40
|
-
SingleStreamDecoder* decoder =
|
|
41
|
-
new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
|
|
42
|
-
return reinterpret_cast<int64_t>(decoder);
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
void encode_audio_to_file_like(
|
|
46
|
-
int64_t data_ptr,
|
|
47
|
-
const std::vector<int64_t>& shape,
|
|
48
|
-
int64_t sample_rate,
|
|
49
|
-
std::string_view format,
|
|
50
|
-
py::object file_like,
|
|
51
|
-
std::optional<int64_t> bit_rate = std::nullopt,
|
|
52
|
-
std::optional<int64_t> num_channels = std::nullopt,
|
|
53
|
-
std::optional<int64_t> desired_sample_rate = std::nullopt) {
|
|
54
|
-
// We assume float32 *and* contiguity, this must be enforced by the caller.
|
|
55
|
-
auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
|
|
56
|
-
auto samples = torch::from_blob(
|
|
57
|
-
reinterpret_cast<void*>(data_ptr), shape, tensor_options);
|
|
58
|
-
|
|
59
|
-
AudioStreamOptions audioStreamOptions;
|
|
60
|
-
audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
|
|
61
|
-
audioStreamOptions.numChannels =
|
|
62
|
-
validateOptionalInt64ToInt(num_channels, "num_channels");
|
|
63
|
-
audioStreamOptions.sampleRate =
|
|
64
|
-
validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
|
|
65
|
-
|
|
66
|
-
auto avioContextHolder =
|
|
67
|
-
std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
|
|
68
|
-
|
|
69
|
-
AudioEncoder encoder(
|
|
70
|
-
samples,
|
|
71
|
-
validateInt64ToInt(sample_rate, "sample_rate"),
|
|
72
|
-
format,
|
|
73
|
-
std::move(avioContextHolder),
|
|
74
|
-
audioStreamOptions);
|
|
75
|
-
encoder.encode();
|
|
29
|
+
// TODO: Investigate if we can do something better here. See:
|
|
30
|
+
// https://github.com/pytorch/torchcodec/issues/896
|
|
31
|
+
// Short version is that we're laundering a pointer through an int, the
|
|
32
|
+
// Python side forwards that to decoder creation functions in
|
|
33
|
+
// custom_ops.cpp and we do another cast on that side to get a pointer
|
|
34
|
+
// again. We want to investigate if we can do something cleaner by
|
|
35
|
+
// defining proper pybind objects.
|
|
36
|
+
int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
|
|
37
|
+
AVIOFileLikeContext* context =
|
|
38
|
+
new AVIOFileLikeContext(file_like, is_for_writing);
|
|
39
|
+
return reinterpret_cast<int64_t>(context);
|
|
76
40
|
}
|
|
77
41
|
|
|
78
42
|
#ifndef PYBIND_OPS_MODULE_NAME
|
|
@@ -80,8 +44,7 @@ void encode_audio_to_file_like(
|
|
|
80
44
|
#endif
|
|
81
45
|
|
|
82
46
|
PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
|
|
83
|
-
m.def("
|
|
84
|
-
m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
|
|
47
|
+
m.def("create_file_like_context", &create_file_like_context);
|
|
85
48
|
}
|
|
86
49
|
|
|
87
50
|
} // namespace facebook::torchcodec
|
|
@@ -105,25 +105,12 @@ class IndexBasedSamplerArgs(SamplerArgs):
|
|
|
105
105
|
sample_step: int = 1
|
|
106
106
|
|
|
107
107
|
|
|
108
|
-
class
|
|
108
|
+
class DEPRECATED_VideoClipSampler(nn.Module):
|
|
109
109
|
"""
|
|
110
|
-
|
|
111
|
-
The video args contains video related information, frames_per_clip, dimensions etc.
|
|
112
|
-
The sampler args can be either time-based or index-based, it will be used to decide clip start time pts or index.
|
|
113
|
-
ClipSampling support, random, uniform, periodic, target, keyframe sampling etc.
|
|
110
|
+
DEPRECATED: Do not use. The supported samplers are in `torchcodec.samplers`. See:
|
|
114
111
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
sampler_args (`SamplerArgs`): The sampler args. Can be TimeBasedSamplerArgs or IndexBasedSamplerArgs
|
|
118
|
-
decoder_args (`DecoderArgs`): Decoder args contain value needs for decoder, for example, thread count
|
|
119
|
-
|
|
120
|
-
Example:
|
|
121
|
-
>>> video_args = VideoArgs(desired_width=224, desired_height=224)
|
|
122
|
-
>>> time_based_sampler_args = TimeBasedSamplerArgs(sampler_type="random", clips_per_video=1, frames_per_clip=4)
|
|
123
|
-
>>> video_decoder_args = DecoderArgs(num_threads=1)
|
|
124
|
-
>>> video_clip_sampler = VideoClipSampler(video_args, time_based_sampler_args, decoder_args)
|
|
125
|
-
>>> clips = video_clip_sampler(video_data)
|
|
126
|
-
clips now contains a list of clip, where clip is a list of frame tensors, each tensor represents a frame image.
|
|
112
|
+
* https://docs.pytorch.org/torchcodec/stable/api_ref_torchcodec.html
|
|
113
|
+
* https://docs.pytorch.org/torchcodec/stable/generated_examples/decoding/sampling.html
|
|
127
114
|
"""
|
|
128
115
|
|
|
129
116
|
def __init__(
|
|
@@ -160,8 +147,7 @@ class VideoClipSampler(nn.Module):
|
|
|
160
147
|
scan_all_streams_to_update_metadata(video_decoder)
|
|
161
148
|
add_video_stream(
|
|
162
149
|
video_decoder,
|
|
163
|
-
|
|
164
|
-
height=target_height,
|
|
150
|
+
transform_specs=f"resize, {target_height}, {target_width}",
|
|
165
151
|
num_threads=self.decoder_args.num_threads,
|
|
166
152
|
)
|
|
167
153
|
|
|
@@ -240,6 +226,8 @@ class VideoClipSampler(nn.Module):
|
|
|
240
226
|
clip_start_idx + i * index_based_sampler_args.video_frame_dilation
|
|
241
227
|
for i in range(index_based_sampler_args.frames_per_clip)
|
|
242
228
|
]
|
|
229
|
+
# Need torch.stack to convert List[Tensor[int]] into 1D Tensor[int]
|
|
230
|
+
batch_indexes = torch.stack(batch_indexes)
|
|
243
231
|
frames, *_ = get_frames_at_indices(
|
|
244
232
|
video_decoder,
|
|
245
233
|
frame_indices=batch_indexes,
|
torchcodec/decoders/__init__.py
CHANGED
|
@@ -4,10 +4,12 @@
|
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
|
6
6
|
|
|
7
|
+
import contextvars
|
|
7
8
|
import io
|
|
9
|
+
from contextlib import contextmanager
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
|
|
10
|
-
from typing import Union
|
|
12
|
+
from typing import Generator, Union
|
|
11
13
|
|
|
12
14
|
from torch import Tensor
|
|
13
15
|
from torchcodec import _core as core
|
|
@@ -50,3 +52,61 @@ def create_decoder(
|
|
|
50
52
|
"read(self, size: int) -> bytes and "
|
|
51
53
|
"seek(self, offset: int, whence: int) -> int methods."
|
|
52
54
|
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Thread-local and async-safe storage for the current CUDA backend
|
|
58
|
+
_CUDA_BACKEND: contextvars.ContextVar[str] = contextvars.ContextVar(
|
|
59
|
+
"_CUDA_BACKEND", default="ffmpeg"
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@contextmanager
|
|
64
|
+
def set_cuda_backend(backend: str) -> Generator[None, None, None]:
|
|
65
|
+
"""Context Manager to set the CUDA backend for :class:`~torchcodec.decoders.VideoDecoder`.
|
|
66
|
+
|
|
67
|
+
This context manager allows you to specify which CUDA backend implementation
|
|
68
|
+
to use when creating :class:`~torchcodec.decoders.VideoDecoder` instances
|
|
69
|
+
with CUDA devices.
|
|
70
|
+
|
|
71
|
+
.. note::
|
|
72
|
+
**We recommend trying the "beta" backend instead of the default "ffmpeg"
|
|
73
|
+
backend!** The beta backend is faster, and will eventually become the
|
|
74
|
+
default in future versions. It may have rough edges that we'll polish
|
|
75
|
+
over time, but it's already quite stable and ready for adoption. Let us
|
|
76
|
+
know what you think!
|
|
77
|
+
|
|
78
|
+
Only the creation of the decoder needs to be inside the context manager, the
|
|
79
|
+
decoding methods can be called outside of it. You still need to pass
|
|
80
|
+
``device="cuda"`` when creating the
|
|
81
|
+
:class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
|
|
82
|
+
specified, this context manager will have no effect. See example below.
|
|
83
|
+
|
|
84
|
+
This is thread-safe and async-safe.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
backend (str): The CUDA backend to use. Can be "ffmpeg" (default) or
|
|
88
|
+
"beta". We recommend trying "beta" as it's faster!
|
|
89
|
+
|
|
90
|
+
Example:
|
|
91
|
+
>>> with set_cuda_backend("beta"):
|
|
92
|
+
... decoder = VideoDecoder("video.mp4", device="cuda")
|
|
93
|
+
...
|
|
94
|
+
... # Only the decoder creation needs to be part of the context manager.
|
|
95
|
+
... # Decoder will now use the beta CUDA implementation:
|
|
96
|
+
... decoder.get_frame_at(0)
|
|
97
|
+
"""
|
|
98
|
+
backend = backend.lower()
|
|
99
|
+
if backend not in ("ffmpeg", "beta"):
|
|
100
|
+
raise ValueError(
|
|
101
|
+
f"Invalid CUDA backend ({backend}). Supported values are 'ffmpeg' and 'beta'."
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
previous_state = _CUDA_BACKEND.set(backend)
|
|
105
|
+
try:
|
|
106
|
+
yield
|
|
107
|
+
finally:
|
|
108
|
+
_CUDA_BACKEND.reset(previous_state)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _get_cuda_backend() -> str:
|
|
112
|
+
return _CUDA_BACKEND.get()
|
|
@@ -15,6 +15,7 @@ from torch import device as torch_device, Tensor
|
|
|
15
15
|
|
|
16
16
|
from torchcodec import _core as core, Frame, FrameBatch
|
|
17
17
|
from torchcodec.decoders._decoder_utils import (
|
|
18
|
+
_get_cuda_backend,
|
|
18
19
|
create_decoder,
|
|
19
20
|
ERROR_REPORTING_INSTRUCTIONS,
|
|
20
21
|
)
|
|
@@ -55,6 +56,8 @@ class VideoDecoder:
|
|
|
55
56
|
Passing 0 lets FFmpeg decide on the number of threads.
|
|
56
57
|
Default: 1.
|
|
57
58
|
device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
|
|
59
|
+
If you pass a CUDA device, we recommend trying the "beta" CUDA
|
|
60
|
+
backend which is faster! See :func:`~torchcodec.decoders.set_cuda_backend`.
|
|
58
61
|
seek_mode (str, optional): Determines if frame access will be "exact" or
|
|
59
62
|
"approximate". Exact guarantees that requesting frame i will always
|
|
60
63
|
return frame i, but doing so requires an initial :term:`scan` of the
|
|
@@ -63,6 +66,27 @@ class VideoDecoder:
|
|
|
63
66
|
probably is. Default: "exact".
|
|
64
67
|
Read more about this parameter in:
|
|
65
68
|
:ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
|
|
69
|
+
custom_frame_mappings (str, bytes, or file-like object, optional):
|
|
70
|
+
Mapping of frames to their metadata, typically generated via ffprobe.
|
|
71
|
+
This enables accurate frame seeking without requiring a full video scan.
|
|
72
|
+
Do not set seek_mode when custom_frame_mappings is provided.
|
|
73
|
+
Expected JSON format:
|
|
74
|
+
|
|
75
|
+
.. code-block:: json
|
|
76
|
+
|
|
77
|
+
{
|
|
78
|
+
"frames": [
|
|
79
|
+
{
|
|
80
|
+
"pts": 0,
|
|
81
|
+
"duration": 1001,
|
|
82
|
+
"key_frame": 1
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
Alternative field names "pkt_pts" and "pkt_duration" are also supported.
|
|
88
|
+
Read more about this parameter in:
|
|
89
|
+
:ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`
|
|
66
90
|
|
|
67
91
|
Attributes:
|
|
68
92
|
metadata (VideoStreamMetadata): Metadata of the video stream.
|
|
@@ -80,6 +104,9 @@ class VideoDecoder:
|
|
|
80
104
|
num_ffmpeg_threads: int = 1,
|
|
81
105
|
device: Optional[Union[str, torch_device]] = "cpu",
|
|
82
106
|
seek_mode: Literal["exact", "approximate"] = "exact",
|
|
107
|
+
custom_frame_mappings: Optional[
|
|
108
|
+
Union[str, bytes, io.RawIOBase, io.BufferedReader]
|
|
109
|
+
] = None,
|
|
83
110
|
):
|
|
84
111
|
torch._C._log_api_usage_once("torchcodec.decoders.VideoDecoder")
|
|
85
112
|
allowed_seek_modes = ("exact", "approximate")
|
|
@@ -89,7 +116,6 @@ class VideoDecoder:
|
|
|
89
116
|
f"Supported values are {', '.join(allowed_seek_modes)}."
|
|
90
117
|
)
|
|
91
118
|
|
|
92
|
-
custom_frame_mappings = None
|
|
93
119
|
# Validate seek_mode and custom_frame_mappings are not mismatched
|
|
94
120
|
if custom_frame_mappings is not None and seek_mode == "approximate":
|
|
95
121
|
raise ValueError(
|
|
@@ -120,12 +146,25 @@ class VideoDecoder:
|
|
|
120
146
|
if isinstance(device, torch_device):
|
|
121
147
|
device = str(device)
|
|
122
148
|
|
|
149
|
+
device_variant = _get_cuda_backend()
|
|
150
|
+
if device_variant == "ffmpeg":
|
|
151
|
+
# TODONVDEC P2 rename 'default' into 'ffmpeg' everywhere.
|
|
152
|
+
device_variant = "default"
|
|
153
|
+
|
|
154
|
+
# Legacy support for device="cuda:0:beta" syntax
|
|
155
|
+
# TODONVDEC P2: remove support for this everywhere. This will require
|
|
156
|
+
# updating our tests.
|
|
157
|
+
if device == "cuda:0:beta":
|
|
158
|
+
device = "cuda:0"
|
|
159
|
+
device_variant = "beta"
|
|
160
|
+
|
|
123
161
|
core.add_video_stream(
|
|
124
162
|
self._decoder,
|
|
125
163
|
stream_index=stream_index,
|
|
126
164
|
dimension_order=dimension_order,
|
|
127
165
|
num_threads=num_ffmpeg_threads,
|
|
128
166
|
device=device,
|
|
167
|
+
device_variant=device_variant,
|
|
129
168
|
custom_frame_mappings=custom_frame_mappings_data,
|
|
130
169
|
)
|
|
131
170
|
|
|
@@ -217,24 +256,20 @@ class VideoDecoder:
|
|
|
217
256
|
duration_seconds=duration_seconds.item(),
|
|
218
257
|
)
|
|
219
258
|
|
|
220
|
-
def get_frames_at(self, indices: list[int]) -> FrameBatch:
|
|
259
|
+
def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch:
|
|
221
260
|
"""Return frames at the given indices.
|
|
222
261
|
|
|
223
262
|
Args:
|
|
224
|
-
indices (list of int): The indices of the frames to retrieve.
|
|
263
|
+
indices (torch.Tensor or list of int): The indices of the frames to retrieve.
|
|
225
264
|
|
|
226
265
|
Returns:
|
|
227
266
|
FrameBatch: The frames at the given indices.
|
|
228
267
|
"""
|
|
229
|
-
if isinstance(indices, torch.Tensor):
|
|
230
|
-
# TODO we should avoid converting tensors to lists and just let the
|
|
231
|
-
# core ops and C++ code natively accept tensors. See
|
|
232
|
-
# https://github.com/pytorch/torchcodec/issues/879
|
|
233
|
-
indices = indices.to(torch.int).tolist()
|
|
234
268
|
|
|
235
269
|
data, pts_seconds, duration_seconds = core.get_frames_at_indices(
|
|
236
270
|
self._decoder, frame_indices=indices
|
|
237
271
|
)
|
|
272
|
+
|
|
238
273
|
return FrameBatch(
|
|
239
274
|
data=data,
|
|
240
275
|
pts_seconds=pts_seconds,
|
|
@@ -298,20 +333,17 @@ class VideoDecoder:
|
|
|
298
333
|
duration_seconds=duration_seconds.item(),
|
|
299
334
|
)
|
|
300
335
|
|
|
301
|
-
def get_frames_played_at(
|
|
336
|
+
def get_frames_played_at(
|
|
337
|
+
self, seconds: Union[torch.Tensor, list[float]]
|
|
338
|
+
) -> FrameBatch:
|
|
302
339
|
"""Return frames played at the given timestamps in seconds.
|
|
303
340
|
|
|
304
341
|
Args:
|
|
305
|
-
seconds (list of float): The timestamps in seconds when the frames are played.
|
|
342
|
+
seconds (torch.Tensor or list of float): The timestamps in seconds when the frames are played.
|
|
306
343
|
|
|
307
344
|
Returns:
|
|
308
345
|
FrameBatch: The frames that are played at ``seconds``.
|
|
309
346
|
"""
|
|
310
|
-
if isinstance(seconds, torch.Tensor):
|
|
311
|
-
# TODO we should avoid converting tensors to lists and just let the
|
|
312
|
-
# core ops and C++ code natively accept tensors. See
|
|
313
|
-
# https://github.com/pytorch/torchcodec/issues/879
|
|
314
|
-
seconds = seconds.to(torch.float).tolist()
|
|
315
347
|
|
|
316
348
|
data, pts_seconds, duration_seconds = core.get_frames_by_pts(
|
|
317
349
|
self._decoder, timestamps=seconds
|
|
@@ -454,11 +486,15 @@ def _read_custom_frame_mappings(
|
|
|
454
486
|
"Invalid custom frame mappings. The 'pts'/'pkt_pts', 'duration'/'pkt_duration', and 'key_frame' keys are required in the frame metadata."
|
|
455
487
|
)
|
|
456
488
|
|
|
457
|
-
|
|
458
|
-
(
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
489
|
+
all_frames = torch.tensor(
|
|
490
|
+
[int(frame[pts_key]) for frame in input_data["frames"]], dtype=torch.int64
|
|
491
|
+
)
|
|
492
|
+
is_key_frame = torch.tensor(
|
|
493
|
+
[int(frame["key_frame"]) for frame in input_data["frames"]], dtype=torch.bool
|
|
494
|
+
)
|
|
495
|
+
duration = torch.tensor(
|
|
496
|
+
[int(frame[duration_key]) for frame in input_data["frames"]], dtype=torch.int64
|
|
497
|
+
)
|
|
462
498
|
if not (len(all_frames) == len(is_key_frame) == len(duration)):
|
|
463
499
|
raise ValueError("Mismatched lengths in frame index data")
|
|
464
500
|
return all_frames, is_key_frame, duration
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|