torchcodec-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- torchcodec/__init__.py +27 -0
- torchcodec/_core/AVIOContextHolder.cpp +60 -0
- torchcodec/_core/AVIOContextHolder.h +64 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
- torchcodec/_core/AVIOFileLikeContext.h +55 -0
- torchcodec/_core/AVIOTensorContext.cpp +130 -0
- torchcodec/_core/AVIOTensorContext.h +44 -0
- torchcodec/_core/BetaCudaDeviceInterface.cpp +849 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +196 -0
- torchcodec/_core/CMakeLists.txt +295 -0
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +124 -0
- torchcodec/_core/CpuDeviceInterface.cpp +509 -0
- torchcodec/_core/CpuDeviceInterface.h +141 -0
- torchcodec/_core/CudaDeviceInterface.cpp +602 -0
- torchcodec/_core/CudaDeviceInterface.h +79 -0
- torchcodec/_core/DeviceInterface.cpp +117 -0
- torchcodec/_core/DeviceInterface.h +191 -0
- torchcodec/_core/Encoder.cpp +1054 -0
- torchcodec/_core/Encoder.h +192 -0
- torchcodec/_core/FFMPEGCommon.cpp +684 -0
- torchcodec/_core/FFMPEGCommon.h +314 -0
- torchcodec/_core/FilterGraph.cpp +159 -0
- torchcodec/_core/FilterGraph.h +59 -0
- torchcodec/_core/Frame.cpp +47 -0
- torchcodec/_core/Frame.h +72 -0
- torchcodec/_core/Metadata.cpp +124 -0
- torchcodec/_core/Metadata.h +92 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1586 -0
- torchcodec/_core/SingleStreamDecoder.h +391 -0
- torchcodec/_core/StreamOptions.h +70 -0
- torchcodec/_core/Transform.cpp +128 -0
- torchcodec/_core/Transform.h +86 -0
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +46 -0
- torchcodec/_core/_metadata.py +262 -0
- torchcodec/_core/custom_ops.cpp +1090 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +169 -0
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +605 -0
- torchcodec/_core/pybind_ops.cpp +50 -0
- torchcodec/_frame.py +146 -0
- torchcodec/_internally_replaced_utils.py +68 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +419 -0
- torchcodec/decoders/__init__.py +12 -0
- torchcodec/decoders/_audio_decoder.py +185 -0
- torchcodec/decoders/_decoder_utils.py +113 -0
- torchcodec/decoders/_video_decoder.py +601 -0
- torchcodec/encoders/__init__.py +2 -0
- torchcodec/encoders/_audio_encoder.py +149 -0
- torchcodec/encoders/_video_encoder.py +196 -0
- torchcodec/libtorchcodec_core4.so +0 -0
- torchcodec/libtorchcodec_core5.so +0 -0
- torchcodec/libtorchcodec_core6.so +0 -0
- torchcodec/libtorchcodec_core7.so +0 -0
- torchcodec/libtorchcodec_core8.so +0 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_custom_ops8.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops8.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +287 -0
- torchcodec/samplers/_time_based.py +358 -0
- torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake +76 -0
- torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake +122 -0
- torchcodec/transforms/__init__.py +12 -0
- torchcodec/transforms/_decoder_transforms.py +375 -0
- torchcodec/version.py +2 -0
- torchcodec-0.10.0.dist-info/METADATA +286 -0
- torchcodec-0.10.0.dist-info/RECORD +88 -0
- torchcodec-0.10.0.dist-info/WHEEL +5 -0
- torchcodec-0.10.0.dist-info/licenses/LICENSE +28 -0
- torchcodec-0.10.0.dist-info/top_level.txt +2 -0

torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake
ADDED
@@ -0,0 +1,76 @@
+# FindTorchCodec
+# --------------
+#
+# Finds the TorchCodec library
+#
+# This will define the following variables:
+#
+# TORCHCODEC_FOUND: True if the system has the TorchCodec library
+# TORCHCODEC_VARIANTS: list of TorchCodec variants. A variant is a supported
+# FFmpeg major version.
+#
+# and the following imported targets:
+#
+# torchcodec::ffmpeg${N}
+# torchcodec::core${N}
+#
+# where N is a TorchCodec variant (FFmpeg major version) from
+# TORCHCODEC_VARIANTS list.
+
+include(FindPackageHandleStandardArgs)
+include("${CMAKE_CURRENT_LIST_DIR}/ffmpeg_versions.cmake")
+
+# Assume we are in <install-prefix>/share/cmake/TorchCodec/TorchCodecConfig.cmake
+get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(TORCHCODEC_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE)
+
+# Include directories.
+set(TORCHCODEC_INCLUDE_DIRS ${TORCHCODEC_INSTALL_PREFIX}/_core)
+set(TORCHCODEC_VARIANTS "")
+
+function(add_torchcodec_target ffmpeg_major_version)
+    set(target torchcodec::core${ffmpeg_major_version})
+
+    if (NOT TARGET torchcodec::ffmpeg${ffmpeg_major_version})
+        message(FATAL_ERROR "torchcodec::ffmpeg${ffmpeg_major_version} target is not defined")
+    endif()
+
+    find_library(lib_path torchcodec_core${ffmpeg_major_version}
+        PATHS "${TORCHCODEC_INSTALL_PREFIX}" NO_CACHE NO_DEFAULT_PATH)
+    if (NOT lib_path)
+        message(FATAL_ERROR "torchcodec_core${ffmpeg_major_version} shared library is missing")
+    endif()
+
+    message("Adding ${target} target")
+    add_library(${target} SHARED IMPORTED)
+    add_dependencies(${target} torchcodec::ffmpeg${ffmpeg_major_version})
+    set_target_properties(${target} PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${TORCHCODEC_INCLUDE_DIRS}
+        IMPORTED_LOCATION ${lib_path}
+    )
+
+    list(APPEND TORCHCODEC_VARIANTS "${ffmpeg_major_version}")
+    set(TORCHCODEC_VARIANTS "${TORCHCODEC_VARIANTS}" PARENT_SCOPE)
+endfunction()
+
+# If any of the TORCHCODEC_FFMPEG${N}_INSTALL_PREFIX environment variables
+# are defined, use them to locate the corresponding FFmpeg and TorchCodec targets.
+# Otherwise, fall back to pkg-config to find FFmpeg.
+set(use_pkg_config TRUE)
+foreach(ffmpeg_major_version IN LISTS TORCHCODEC_SUPPORTED_FFMPEG_VERSIONS)
+    if (DEFINED ENV{TORCHCODEC_FFMPEG${ffmpeg_major_version}_INSTALL_PREFIX})
+        add_ffmpeg_target(
+            "${ffmpeg_major_version}"
+            "$ENV{TORCHCODEC_FFMPEG${ffmpeg_major_version}_INSTALL_PREFIX}"
+        )
+        add_torchcodec_target(${ffmpeg_major_version})
+        set(use_pkg_config FALSE)
+    endif()
+endforeach()
+
+if (use_pkg_config)
+    add_ffmpeg_target_with_pkg_config(ffmpeg_major_version)
+    add_torchcodec_target(${ffmpeg_major_version})
+endif()
+
+find_package_handle_standard_args(TorchCodec DEFAULT_MSG TORCHCODEC_VARIANTS)
torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake
ADDED
@@ -0,0 +1,122 @@
+# This file exposes helpers to create and expose FFmpeg targets as torchcodec::ffmpeg${N}
+# where N is the FFmpeg major version.
+
+# List of FFmpeg versions that TorchCodec can support - that's not a list of
+# FFmpeg versions available on the current system!
+set(TORCHCODEC_SUPPORTED_FFMPEG_VERSIONS "4;5;6;7;8")
+
+# Create and expose torchcodec::ffmpeg${ffmpeg_major_version} target which can
+# then be used as a dependency in other targets.
+# prefix is the path to the FFmpeg installation containing the usual `include`
+# and `lib` directories.
+function(add_ffmpeg_target ffmpeg_major_version prefix)
+    # Check that given ffmpeg major version is something we support and error out if
+    # it's not.
+    list(FIND TORCHCODEC_SUPPORTED_FFMPEG_VERSIONS "${ffmpeg_major_version}" _index)
+    if (_index LESS 0)
+        message(FATAL_ERROR "FFmpeg version ${ffmpeg_major_version} is not supported")
+    endif()
+    if (NOT DEFINED prefix)
+        message(FATAL_ERROR "No prefix defined calling add_ffmpeg_target()")
+    endif()
+
+    # Define library names based on platform and FFmpeg version
+    if (LINUX)
+        if (ffmpeg_major_version EQUAL 4)
+            set(library_file_names libavutil.so.56 libavcodec.so.58 libavformat.so.58 libavdevice.so.58 libavfilter.so.7 libswscale.so.5 libswresample.so.3)
+        elseif (ffmpeg_major_version EQUAL 5)
+            set(library_file_names libavutil.so.57 libavcodec.so.59 libavformat.so.59 libavdevice.so.59 libavfilter.so.8 libswscale.so.6 libswresample.so.4)
+        elseif (ffmpeg_major_version EQUAL 6)
+            set(library_file_names libavutil.so.58 libavcodec.so.60 libavformat.so.60 libavdevice.so.60 libavfilter.so.9 libswscale.so.7 libswresample.so.4)
+        elseif (ffmpeg_major_version EQUAL 7)
+            set(library_file_names libavutil.so.59 libavcodec.so.61 libavformat.so.61 libavdevice.so.61 libavfilter.so.10 libswscale.so.8 libswresample.so.5)
+        elseif (ffmpeg_major_version EQUAL 8)
+            set(library_file_names libavutil.so.60 libavcodec.so.62 libavformat.so.62 libavdevice.so.62 libavfilter.so.11 libswscale.so.9 libswresample.so.6)
+        endif()
+    elseif (APPLE)
+        if (ffmpeg_major_version EQUAL 4)
+            set(library_file_names libavutil.56.dylib libavcodec.58.dylib libavformat.58.dylib libavdevice.58.dylib libavfilter.7.dylib libswscale.5.dylib libswresample.3.dylib)
+        elseif (ffmpeg_major_version EQUAL 5)
+            set(library_file_names libavutil.57.dylib libavcodec.59.dylib libavformat.59.dylib libavdevice.59.dylib libavfilter.8.dylib libswscale.6.dylib libswresample.4.dylib)
+        elseif (ffmpeg_major_version EQUAL 6)
+            set(library_file_names libavutil.58.dylib libavcodec.60.dylib libavformat.60.dylib libavdevice.60.dylib libavfilter.9.dylib libswscale.7.dylib libswresample.4.dylib)
+        elseif (ffmpeg_major_version EQUAL 7)
+            set(library_file_names libavutil.59.dylib libavcodec.61.dylib libavformat.61.dylib libavdevice.61.dylib libavfilter.10.dylib libswscale.8.dylib libswresample.5.dylib)
+        elseif (ffmpeg_major_version EQUAL 8)
+            set(library_file_names libavutil.60.dylib libavcodec.62.dylib libavformat.62.dylib libavdevice.62.dylib libavfilter.11.dylib libswscale.9.dylib libswresample.6.dylib)
+        endif()
+    elseif (WIN32)
+        set(library_file_names avutil.lib avcodec.lib avformat.lib avdevice.lib avfilter.lib swscale.lib swresample.lib)
+    else()
+        message(FATAL_ERROR "Unsupported operating system: ${CMAKE_SYSTEM_NAME}")
+    endif()
+
+    set(target "torchcodec::ffmpeg${ffmpeg_major_version}")
+    set(include_dir "${prefix}/include")
+    if (LINUX OR APPLE)
+        set(lib_dir "${prefix}/lib")
+    elseif (WIN32)
+        set(lib_dir "${prefix}/bin")
+    else()
+        message(FATAL_ERROR "Unsupported operating system: ${CMAKE_SYSTEM_NAME}")
+    endif()
+
+    list(
+        TRANSFORM library_file_names
+        PREPEND ${lib_dir}/
+        OUTPUT_VARIABLE lib_paths
+    )
+
+    message("Adding ${target} target")
+    # Verify that ffmpeg includes and libraries actually exist.
+    foreach (path IN LISTS include_dir lib_paths)
+        if (NOT EXISTS "${path}")
+            message(FATAL_ERROR "${path} does not exist")
+        endif()
+    endforeach()
+
+    # Actually define the target
+    add_library(${target} INTERFACE IMPORTED)
+    target_include_directories(${target} INTERFACE ${include_dir})
+    target_link_libraries(${target} INTERFACE ${lib_paths})
+endfunction()
+
+# Create and expose torchcodec::ffmpeg${ffmpeg_major_version} target which can
+# then be used as a dependency in other targets.
+# The FFmpeg installation is found by pkg-config.
+function(add_ffmpeg_target_with_pkg_config ret_ffmpeg_major_version_var)
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(TORCHCODEC_LIBAV REQUIRED IMPORTED_TARGET
+        libavdevice
+        libavfilter
+        libavformat
+        libavcodec
+        libavutil
+        libswresample
+        libswscale
+    )
+
+    # Split libavcodec's version string by '.' and convert it to a list
+    # The TORCHCODEC_LIBAV_libavcodec_VERSION is made available by pkg-config.
+    string(REPLACE "." ";" libavcodec_version_list ${TORCHCODEC_LIBAV_libavcodec_VERSION})
+    # Get the first element of the list, which is the major version
+    list(GET libavcodec_version_list 0 libavcodec_major_version)
+
+    if (${libavcodec_major_version} STREQUAL "58")
+        set(ffmpeg_major_version "4")
+    elseif (${libavcodec_major_version} STREQUAL "59")
+        set(ffmpeg_major_version "5")
+    elseif (${libavcodec_major_version} STREQUAL "60")
+        set(ffmpeg_major_version "6")
+    elseif (${libavcodec_major_version} STREQUAL "61")
+        set(ffmpeg_major_version "7")
+    elseif (${libavcodec_major_version} STREQUAL "62")
+        set(ffmpeg_major_version "8")
+    else()
+        message(FATAL_ERROR "Unsupported libavcodec version: ${libavcodec_major_version}")
+    endif()
+
+    message("Adding torchcodec::ffmpeg${ffmpeg_major_version} target")
+    add_library(torchcodec::ffmpeg${ffmpeg_major_version} ALIAS PkgConfig::TORCHCODEC_LIBAV)
+    set(${ret_ffmpeg_major_version_var} ${ffmpeg_major_version} PARENT_SCOPE)
+endfunction()
torchcodec/transforms/__init__.py
ADDED
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ._decoder_transforms import (  # noqa
+    CenterCrop,
+    DecoderTransform,
+    RandomCrop,
+    Resize,
+)
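The module above only re-exports the public transform classes; they are defined in torchcodec/transforms/_decoder_transforms.py, whose diff follows. For orientation, a minimal sketch of constructing them (the sizes here are illustrative and not taken from the package):

    from torchcodec.transforms import CenterCrop, RandomCrop, Resize

    # Every decoder transform takes a (height, width) pair, mirroring its
    # TorchVision v2 counterpart of the same name.
    resize = Resize(size=(270, 480))
    center_crop = CenterCrop(size=(224, 224))
    random_crop = RandomCrop(size=(224, 224))
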
torchcodec/transforms/_decoder_transforms.py
ADDED
@@ -0,0 +1,375 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from types import ModuleType
+
+import torch
+from torch import nn
+
+
+class DecoderTransform(ABC):
+    """Base class for all decoder transforms.
+
+    A *decoder transform* is a transform that is applied by the decoder before
+    returning the decoded frame. Applying decoder transforms to frames
+    should be both faster and more memory efficient than receiving normally
+    decoded frames and applying the same kind of transform.
+
+    Most ``DecoderTransform`` objects have a complementary transform in TorchVision,
+    specifically in `torchvision.transforms.v2 <https://docs.pytorch.org/vision/stable/transforms.html>`_.
+    For such transforms, we ensure that:
+
+    1. The names are the same.
+    2. Default behaviors are the same.
+    3. The parameters for the ``DecoderTransform`` object are a subset of the
+       TorchVision :class:`~torchvision.transforms.v2.Transform` object.
+    4. Parameters with the same name control the same behavior and accept a
+       subset of the same types.
+    5. The difference between the frames returned by a decoder transform and
+       the complementary TorchVision transform are such that a model should
+       not be able to tell the difference.
+    """
+
+    @abstractmethod
+    def _make_transform_spec(self, input_dims: tuple[int | None, int | None]) -> str:
+        """Makes the transform spec that is used by the `VideoDecoder`.
+
+        Args:
+            input_dims (tuple[int | None, int | None]): The dimensions of
+                the input frame in the form (height, width). We cannot know the
+                dimensions at object construction time because it's dependent on
+                the video being decoded and upstream transforms in the same
+                transform pipeline. Not all transforms need to know this; those
+                that don't will ignore it. The individual values in the tuple are
+                optional because the original values come from file metadata which
+                may be missing. We maintain the optionality throughout the APIs so
+                that we can decide as late as possible that it's necessary for the
+                values to exist. That is, if the values are missing from the
+                metadata and we have transforms which ignore the input dimensions,
+                we want that to still work.
+
+        Note: This method is the moral equivalent of TorchVision's
+        `Transform.make_params()`.
+
+        Returns:
+            str: A string which contains the spec for the transform that the
+                `VideoDecoder` knows what to do with.
+        """
+        pass
+
+    def _get_output_dims(self) -> tuple[int | None, int | None] | None:
+        """Get the dimensions of the output frame.
+
+        Transforms that change the frame dimensions need to override this
+        method. Transforms that don't change the frame dimensions can rely on
+        this default implementation.
+
+        Returns:
+            tuple[int | None, int | None] | None: The output dimensions.
+                - None: The output dimensions are the same as the input dimensions.
+                - (int, int): The (height, width) of the output frame.
+        """
+        return None
+
+
+def import_torchvision_transforms_v2() -> ModuleType:
+    try:
+        from torchvision.transforms import v2
+    except ImportError as e:
+        raise RuntimeError(
+            "Cannot import TorchVision; this should never happen, please report a bug."
+        ) from e
+    return v2
+
+
+class Resize(DecoderTransform):
+    """Resize the decoded frame to a given size.
+
+    Complementary TorchVision transform: :class:`~torchvision.transforms.v2.Resize`.
+    Interpolation is always bilinear. Anti-aliasing is always on.
+
+    Args:
+        size (Sequence[int]): Desired output size. Must be a sequence of
+            the form (height, width).
+    """
+
+    def __init__(self, size: Sequence[int]):
+        if len(size) != 2:
+            raise ValueError(
+                "Resize transform must have a (height, width) "
+                f"pair for the size, got {size}."
+            )
+        self.size = size
+
+    def _make_transform_spec(self, input_dims: tuple[int | None, int | None]) -> str:
+        return f"resize, {self.size[0]}, {self.size[1]}"
+
+    def _get_output_dims(self) -> tuple[int | None, int | None] | None:
+        return (self.size[0], self.size[1])
+
+    @classmethod
+    def _from_torchvision(cls, tv_resize: nn.Module):
+        v2 = import_torchvision_transforms_v2()
+
+        assert isinstance(tv_resize, v2.Resize)
+
+        if tv_resize.interpolation is not v2.InterpolationMode.BILINEAR:
+            raise ValueError(
+                "TorchVision Resize transform must use bilinear interpolation."
+            )
+        if tv_resize.antialias is False:
+            raise ValueError(
+                "TorchVision Resize transform must have antialias enabled."
+            )
+        if tv_resize.size is None:
+            raise ValueError("TorchVision Resize transform must have a size specified.")
+        if len(tv_resize.size) != 2:
+            raise ValueError(
+                "TorchVision Resize transform must have a (height, width) "
+                f"pair for the size, got {tv_resize.size}."
+            )
+        return cls(size=tv_resize.size)
+
+
+class CenterCrop(DecoderTransform):
+    """Crop the decoded frame to a given size in the center of the frame.
+
+    Complementary TorchVision transform: :class:`~torchvision.transforms.v2.CenterCrop`.
+
+    Args:
+        size (Sequence[int]): Desired output size. Must be a sequence of
+            the form (height, width).
+    """
+
+    def __init__(self, size: Sequence[int]):
+        if len(size) != 2:
+            raise ValueError(
+                "CenterCrop transform must have a (height, width) "
+                f"pair for the size, got {size}."
+            )
+        self.size = size
+
+    def _make_transform_spec(self, input_dims: tuple[int | None, int | None]) -> str:
+        return f"center_crop, {self.size[0]}, {self.size[1]}"
+
+    def _get_output_dims(self) -> tuple[int | None, int | None] | None:
+        return (self.size[0], self.size[1])
+
+    @classmethod
+    def _from_torchvision(
+        cls,
+        tv_center_crop: nn.Module,
+    ):
+        v2 = import_torchvision_transforms_v2()
+
+        if not isinstance(tv_center_crop, v2.CenterCrop):
+            raise ValueError(
+                "Transform must be TorchVision's CenterCrop, "
+                f"it is instead {type(tv_center_crop).__name__}. "
+                "This should never happen, please report a bug."
+            )
+
+        if len(tv_center_crop.size) != 2:
+            raise ValueError(
+                "TorchVision CenterCrop transform must have a (height, width) "
+                f"pair for the size, got {tv_center_crop.size}."
+            )
+
+        return cls(size=tv_center_crop.size)
+
+
+class RandomCrop(DecoderTransform):
+    """Crop the decoded frame to a given size at a random location in the frame.
+
+    Complementary TorchVision transform: :class:`~torchvision.transforms.v2.RandomCrop`.
+    Padding of all kinds is disabled. The random location within the frame is
+    determined during the initialization of the
+    :class:`~torchcodec.decoders.VideoDecoder` object that owns this transform.
+    As a consequence, each decoded frame in the video will be cropped at the
+    same location. Videos with variable resolution may result in undefined
+    behavior.
+
+    Args:
+        size (Sequence[int]): Desired output size. Must be a sequence of
+            the form (height, width).
+    """
+
+    def __init__(self, size: Sequence[int]):
+        if len(size) != 2:
+            raise ValueError(
+                "RandomCrop transform must have a (height, width) "
+                f"pair for the size, got {size}."
+            )
+        self.size = size
+
+    def _make_transform_spec(self, input_dims: tuple[int | None, int | None]) -> str:
+        height, width = input_dims
+        if height is None:
+            raise ValueError(
+                "Video metadata has no height. "
+                "RandomCrop can only be used when input frame dimensions are known."
+            )
+        if width is None:
+            raise ValueError(
+                "Video metadata has no width. "
+                "RandomCrop can only be used when input frame dimensions are known."
+            )
+
+        # Note: This logic below must match the logic in
+        # torchvision.transforms.v2.RandomCrop.make_params(). Given
+        # the same seed, they should get the same result. This is an
+        # API guarantee with our users.
+        if height < self.size[0] or width < self.size[1]:
+            raise ValueError(
+                f"Input dimensions {input_dims} are smaller than the crop size {self.size}."
+            )
+
+        top = int(torch.randint(0, height - self.size[0] + 1, size=()).item())
+        left = int(torch.randint(0, width - self.size[1] + 1, size=()).item())
+
+        return f"crop, {self.size[0]}, {self.size[1]}, {left}, {top}"
+
+    def _get_output_dims(self) -> tuple[int | None, int | None] | None:
+        return (self.size[0], self.size[1])
+
+    @classmethod
+    def _from_torchvision(
+        cls,
+        tv_random_crop: nn.Module,
+    ):
+        v2 = import_torchvision_transforms_v2()
+
+        if not isinstance(tv_random_crop, v2.RandomCrop):
+            raise ValueError(
+                "Transform must be TorchVision's RandomCrop, "
+                f"it is instead {type(tv_random_crop).__name__}. "
+                "This should never happen, please report a bug."
+            )
+
+        if tv_random_crop.padding is not None:
+            raise ValueError(
+                "TorchVision RandomCrop transform must not specify padding."
+            )
+
+        if tv_random_crop.pad_if_needed is True:
+            raise ValueError(
+                "TorchVision RandomCrop transform must not specify pad_if_needed."
+            )
+
+        if tv_random_crop.fill != 0:
+            raise ValueError("TorchVision RandomCrop fill must be 0.")
+
+        if tv_random_crop.padding_mode != "constant":
+            raise ValueError("TorchVision RandomCrop padding_mode must be constant.")
+
+        if len(tv_random_crop.size) != 2:
+            raise ValueError(
+                "TorchVision RandomCrop transform must have a (height, width) "
+                f"pair for the size, got {tv_random_crop.size}."
+            )
+
+        return cls(size=tv_random_crop.size)
+
+
+def _make_transform_specs(
+    transforms: Sequence[DecoderTransform | nn.Module] | None,
+    input_dims: tuple[int | None, int | None],
+) -> str:
+    """Given a sequence of transforms, turn those into the specification string
+    the core API expects.
+
+    Args:
+        transforms: Optional sequence of transform objects. The objects can be
+            one of two types:
+            1. torchcodec.transforms.DecoderTransform
+            2. torchvision.transforms.v2.Transform, but our type annotation
+               only mentions its base, nn.Module. We don't want to take a
+               hard dependency on TorchVision.
+        input_dims: Optional (height, width) pair. Note that only some
+            transforms need to know the dimensions. If the user provides
+            transforms that don't need to know the dimensions, and that metadata
+            is missing, everything should still work. That means we assert their
+            existence as late as possible.
+
+    Returns:
+        String of transforms in the format the core API expects: transform
+        specifications separated by semicolons.
+    """
+    if transforms is None:
+        return ""
+
+    try:
+        from torchvision.transforms import v2
+
+        tv_available = True
+    except ImportError:
+        tv_available = False
+
+    # The following loop accomplishes two tasks:
+    #
+    # 1. Converts the transform to a DecoderTransform, if necessary. We
+    #    accept TorchVision transform objects and they must be converted
+    #    to their matching DecoderTransform.
+    # 2. Calculates what the input dimensions are to each transform.
+    #
+    # The order in our transforms list is semantically meaningful, as we
+    # actually have a pipeline where the output of one transform is the input to
+    # the next. For example, if we have the transforms list [A, B, C, D], then
+    # we should understand that as:
+    #
+    #   A -> B -> C -> D
+    #
+    # Where the frame produced by A is the input to B, the frame produced by B
+    # is the input to C, etc. This particularly matters for frame dimensions.
+    # Transforms can both:
+    #
+    # 1. Produce frames with arbitrary dimensions.
+    # 2. Rely on their input frame's dimensions to calculate ahead-of-time
+    #    what their runtime behavior will be.
+    #
+    # The consequence of the above facts is that we need to statically track
+    # frame dimensions in the pipeline while we pre-process it. The input
+    # frame's dimensions to A, our first transform, is always what we know from
+    # our metadata. For each transform, we always calculate its output
+    # dimensions from its input dimensions. We store these with the converted
+    # transform, to be all used together when we generate the specs.
+    converted_transforms: list[
+        tuple[
+            DecoderTransform,
+            # A (height, width) pair where the values may be missing.
+            tuple[int | None, int | None],
+        ]
+    ] = []
+    curr_input_dims = input_dims
+    for transform in transforms:
+        if not isinstance(transform, DecoderTransform):
+            if not tv_available:
+                raise ValueError(
+                    f"The supplied transform, {transform}, is not a TorchCodec "
+                    "DecoderTransform. TorchCodec also accepts TorchVision "
+                    "v2 transforms, but TorchVision is not installed."
+                )
+            elif isinstance(transform, v2.Resize):
+                transform = Resize._from_torchvision(transform)
+            elif isinstance(transform, v2.CenterCrop):
+                transform = CenterCrop._from_torchvision(transform)
+            elif isinstance(transform, v2.RandomCrop):
+                transform = RandomCrop._from_torchvision(transform)
+            else:
+                raise ValueError(
+                    f"Unsupported transform: {transform}. Transforms must be "
+                    "either a TorchCodec DecoderTransform or a TorchVision "
+                    "v2 transform."
+                )
+
+        converted_transforms.append((transform, curr_input_dims))
+        output_dims = transform._get_output_dims()
+        curr_input_dims = output_dims if output_dims is not None else curr_input_dims
+
+    return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
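The decoder consumes these transforms as a semicolon-separated spec string built by the private _make_transform_specs helper above. A minimal sketch of that conversion follows; the sizes are illustrative, _make_transform_specs is internal API, and in normal use the transforms would instead be handed to torchcodec.decoders.VideoDecoder, which owns this step:

    from torchcodec.transforms import CenterCrop, Resize
    from torchcodec.transforms._decoder_transforms import _make_transform_specs

    # input_dims is the (height, width) the decoder reads from the stream metadata.
    spec = _make_transform_specs(
        transforms=[Resize(size=(270, 480)), CenterCrop(size=(224, 224))],
        input_dims=(1080, 1920),
    )
    # Each transform contributes one "name, args..." entry; CenterCrop sees the
    # dimensions produced by Resize, per the pipeline-tracking loop above.
    print(spec)  # resize, 270, 480;center_crop, 224, 224
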
torchcodec/version.py
ADDED