PyPI - dct-autoencoder - Versions diffs - 0.1.0__py3-none-any.whl - Mend

dct-autoencoder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

dct_autoencoder/__init__.py +2 -0
dct_autoencoder/basis.py +56 -0
dct_autoencoder/core.py +260 -0
dct_autoencoder/utils.py +46 -0
dct_autoencoder/visualization.py +52 -0
dct_autoencoder-0.1.0.dist-info/LICENSE +21 -0
dct_autoencoder-0.1.0.dist-info/METADATA +34 -0
dct_autoencoder-0.1.0.dist-info/RECORD +9 -0
dct_autoencoder-0.1.0.dist-info/WHEEL +4 -0

dct_autoencoder/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .basis import DCTBasis, get_dct_basis
2	+ from .core import DCTAutoencoder

dct_autoencoder/basis.py ADDED Viewed

@@ -0,0 +1,56 @@
+from typing import NamedTuple
+import numpy as np
class DCTBasis(NamedTuple):
    """Bundle of precomputed quantities describing a square 2D DCT basis.

    Instances are produced by ``get_dct_basis``; the per-field notes below
    describe the values that function stores in each slot.
    """

    # Cosine basis patches, indexed as [v, u, row, col]; float32 array of
    # shape (block_size, block_size, block_size, block_size).
    basis_functions: np.ndarray
    # Integer (v, u) frequency pair for every basis function; shape
    # (block_size, block_size, 2).
    spatial_frequencies_components: np.ndarray
    # Euclidean norm of each (v, u) pair; shape (block_size, block_size).
    spatial_frequencies_magnitude: np.ndarray
    # Per-frequency factor c_u * c_v, where c is 1/sqrt(2) for frequency 0
    # and 1 otherwise; shape (block_size, block_size).
    multiplication_factor_matrix: np.ndarray
    # Global scale factor, 2 / block_size.
    multiplication_factor_scalar: float
    # Side length of the square block the basis was built for.
    block_size: int
def get_dct_basis(block_size: int = 8) -> DCTBasis:
    """Generate the DCT basis variables for a given block size.

    Args:
        block_size (int, optional): The block size. Defaults to 8.

    Returns:
        DCTBasis: The DCT basis variables.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    # A zero size would otherwise surface as a bare ZeroDivisionError from the
    # 2 / block_size scale factor, and a negative one as an obscure NumPy error.
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    indices = np.arange(block_size)

    # cosines[k, n] = cos((2n + 1) * k * pi / (2 * block_size)); the same 1D
    # table serves both the horizontal (u, x) and vertical (v, y) directions.
    cosines = np.cos(
        ((2 * indices[np.newaxis, :] + 1) * indices[:, np.newaxis] * np.pi)
        / (2 * block_size)
    )

    # basis_functions[v, u, y, x] = cosines[v, y] * cosines[u, x], computed in
    # float64 and stored as float32.
    basis_functions = (
        cosines[:, np.newaxis, :, np.newaxis] * cosines[np.newaxis, :, np.newaxis, :]
    ).astype(np.float32)

    # spatial_frequencies[v, u] = (v, u)
    spatial_frequencies = np.zeros((block_size, block_size, 2), dtype=np.int64)
    spatial_frequencies[..., 0] = indices[:, np.newaxis]
    spatial_frequencies[..., 1] = indices[np.newaxis, :]

    # Normalisation constant: 1/sqrt(2) for the zero frequency, 1 otherwise;
    # the matrix holds the product c_v * c_u for every frequency pair.
    normalization = np.where(indices == 0, 1 / np.sqrt(2), 1.0)
    multiplication_factor_matrix = (
        normalization[:, np.newaxis] * normalization[np.newaxis, :]
    ).astype(np.float32)

    spatial_frequencies_magnitude = np.linalg.norm(spatial_frequencies, axis=2)
    multiplication_factor_scalar = 2 / block_size

    return DCTBasis(
        basis_functions=basis_functions,
        spatial_frequencies_components=spatial_frequencies,
        spatial_frequencies_magnitude=spatial_frequencies_magnitude,
        multiplication_factor_matrix=multiplication_factor_matrix,
        multiplication_factor_scalar=multiplication_factor_scalar,
        block_size=block_size,
    )

dct_autoencoder/core.py ADDED Viewed

@@ -0,0 +1,260 @@
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+from .basis import get_dct_basis
+from .utils import rgb_to_ycbcr, ycbcr_to_rgb
class DCTAutoencoder(nn.Module):
    """Block-wise 2D DCT encoder/decoder for RGB images.

    Images are converted to YCbCr and every non-overlapping
    ``block_size`` x ``block_size`` patch of each plane is projected onto the
    DCT basis with a strided convolution. Coefficient channels are sorted by
    ascending spatial-frequency magnitude, so ``compress`` can drop the
    highest frequencies simply by truncating the channel axis.

    Args:
        block_size (int, optional): The block size. Defaults to 8.
    """

    def __init__(self, block_size: int = 8) -> None:
        super().__init__()
        dct_basis = get_dct_basis(block_size)

        # Flatten the (v, u) frequency grid and order it from low to high
        # spatial-frequency magnitude; kernels and normalisation factors are
        # permuted with the same indices so they stay aligned.
        spatial_frequencies_magnitude = dct_basis.spatial_frequencies_magnitude.reshape(
            -1
        )
        sort_indices = np.argsort(spatial_frequencies_magnitude)
        spatial_frequencies_magnitude = spatial_frequencies_magnitude[sort_indices]

        # One single-input-channel convolution kernel per basis function:
        # shape (block_size**2, 1, block_size, block_size).
        kernels = dct_basis.basis_functions.reshape(-1, block_size, block_size)
        kernels = kernels[sort_indices][:, np.newaxis, :, :]

        # Shape (1, block_size**2, 1, 1) so it broadcasts over the channel axis.
        multiplication_factor_matrix = dct_basis.multiplication_factor_matrix.reshape(
            -1
        )[sort_indices]
        multiplication_factor_matrix = multiplication_factor_matrix[
            np.newaxis, :, np.newaxis, np.newaxis
        ]

        self.register_buffer("kernels", torch.from_numpy(kernels))
        self.register_buffer(
            "spatial_frequencies_magnitude",
            torch.from_numpy(spatial_frequencies_magnitude),
        )
        self.register_buffer("block_size", torch.tensor(block_size))
        self.register_buffer(
            "multiplication_factor_scalar",
            torch.tensor(dct_basis.multiplication_factor_scalar),
        )
        self.register_buffer(
            "multiplication_factor_matrix",
            torch.from_numpy(multiplication_factor_matrix),
        )
        self.embedding_dimension = (block_size**2) * 3

    def _num_kept_coefficients(self, compression_ratio: float) -> int:
        """Return how many per-plane coefficients a compression ratio keeps.

        ``compress``, ``decompress`` and ``get_num_compressed_channels`` all
        go through this helper so that they always agree on the channel
        layout of a compressed tensor.
        """
        num_per_channel_encodings = self.block_size**2
        return int(torch.round(num_per_channel_encodings * compression_ratio).item())

    def encode(self, rgb_images_batch: torch.Tensor) -> torch.Tensor:
        """Encodes the input RGB images.

        Args:
            rgb_images_batch (torch.Tensor): The input RGB images. The images
                should have shape (batch, 3, height, width). Image values
                should be in the range [0, 1].

        Returns:
            torch.Tensor: The encoded images, with shape
                (batch, 3 * block_size**2, height / block_size,
                width / block_size). Channels hold the Y coefficients first,
                then Cb, then Cr.

        Raises:
            ValueError: If the input is not a 4D batch of 3-channel images or
                if its height/width is not divisible by the block size.
        """
        if len(rgb_images_batch.shape) != 4:
            raise ValueError("Input images must have shape (batch, 3, height, width)")
        _, c, h, w = rgb_images_batch.shape
        if c != 3:
            raise ValueError("Input images must be RGB")
        # Read the buffer once instead of once per colour plane.
        block_size = int(self.block_size)
        if h % block_size != 0 or w % block_size != 0:
            raise ValueError("Image dimensions must be divisible by the block size")

        # Convert to YCbCr and normalize from [0, 1] to [-1, 1].
        ycbcr_tsr = 2 * rgb_to_ycbcr(rgb_images_batch) - 1

        # DCT encode each plane independently; a stride equal to the kernel
        # size makes the convolution operate on non-overlapping blocks.
        scale = self.multiplication_factor_scalar * self.multiplication_factor_matrix
        encoded_planes = [
            scale
            * F.conv2d(
                ycbcr_tsr[:, plane : plane + 1, :, :], self.kernels, stride=block_size
            )
            for plane in range(3)
        ]
        return torch.cat(encoded_planes, dim=1)

    def decode(self, encodings_batch: torch.Tensor) -> torch.Tensor:
        """Decodes the input encoded images.

        Args:
            encodings_batch (torch.Tensor): The input encoded images, with
                3 * block_size**2 channels laid out as produced by ``encode``
                (or restored by ``decompress``).

        Returns:
            torch.Tensor: The decoded RGB images.

        Raises:
            ValueError: If the input does not have 3 * block_size**2 channels.
        """
        block_size = int(self.block_size)
        org_ch = block_size**2
        if len(encodings_batch.shape) != 4 or encodings_batch.shape[1] != 3 * org_ch:
            raise ValueError(
                f"Encodings must have shape (batch, {3 * org_ch}, height, width)"
            )

        # DCT decode: the transposed convolution paints each coefficient's
        # basis patch back onto its block.
        c1 = self.multiplication_factor_scalar
        c2 = self.multiplication_factor_matrix
        decoded_planes = [
            c1
            * F.conv_transpose2d(
                encodings_batch[:, plane * org_ch : (plane + 1) * org_ch, :, :] * c2,
                self.kernels,
                stride=block_size,
            )
            for plane in range(3)
        ]

        # Undo the [-1, 1] normalisation and convert back to RGB.
        ycbcr_tsr = torch.cat(decoded_planes, dim=1) / 2 + 0.5
        return ycbcr_to_rgb(ycbcr_tsr)

    def get_num_compressed_channels(
        self,
        luminance_compression_ratio: float = 1 / 2,
        chrominance_compression_ratio: float = 1 / 4,
    ) -> int:
        """Get the number of compressed channels.

        Args:
            luminance_compression_ratio (float, optional): The luminance
                compression ratio. Defaults to 1/2.
            chrominance_compression_ratio (float, optional): The chrominance
                compression ratio. Defaults to 1/4.

        Returns:
            int: The number of compressed channels.
        """
        num_luminance_encodings = self._num_kept_coefficients(
            luminance_compression_ratio
        )
        num_chrominance_encodings = self._num_kept_coefficients(
            chrominance_compression_ratio
        )
        return num_luminance_encodings + 2 * num_chrominance_encodings

    def compress(
        self,
        encodings_batch: torch.Tensor,
        luminance_compression_ratio: float = 1 / 2,
        chrominance_compression_ratio: float = 1 / 4,
    ) -> torch.Tensor:
        """Compresses the input encodings.

        Keeps only the leading (lowest-frequency) coefficients of each plane.

        Args:
            encodings_batch (torch.Tensor): The input encodings.
            luminance_compression_ratio (float, optional): The luminance
                compression ratio. Defaults to 1/2.
            chrominance_compression_ratio (float, optional): The chrominance
                compression ratio. Defaults to 1/4.

        Returns:
            torch.Tensor: The compressed encodings.
        """
        num_per_channel_encodings = int(self.block_size) ** 2
        num_luminance_encodings = self._num_kept_coefficients(
            luminance_compression_ratio
        )
        num_chrominance_encodings = self._num_kept_coefficients(
            chrominance_compression_ratio
        )

        # Split into the three colour planes first, then truncate each one.
        luminance_encodings = encodings_batch[:, :num_per_channel_encodings]
        chrominance_blue_encodings = encodings_batch[
            :, num_per_channel_encodings : 2 * num_per_channel_encodings
        ]
        chrominance_red_encodings = encodings_batch[:, 2 * num_per_channel_encodings :]

        return torch.cat(
            [
                luminance_encodings[:, :num_luminance_encodings],
                chrominance_blue_encodings[:, :num_chrominance_encodings],
                chrominance_red_encodings[:, :num_chrominance_encodings],
            ],
            dim=1,
        )

    def decompress(
        self,
        compressed_encodings_batch: torch.Tensor,
        luminance_compression_ratio: float = 1 / 2,
        chrominance_compression_ratio: float = 1 / 4,
    ) -> torch.Tensor:
        """Decompresses the input compressed encodings.

        Restores the full channel layout by zero-filling the coefficients that
        ``compress`` dropped. The ratios must match the ones used to compress.

        Args:
            compressed_encodings_batch (torch.Tensor): The input compressed
                encodings.
            luminance_compression_ratio (float, optional): The luminance
                compression ratio. Defaults to 1/2.
            chrominance_compression_ratio (float, optional): The chrominance
                compression ratio. Defaults to 1/4.

        Returns:
            torch.Tensor: The decompressed encodings.

        Raises:
            ValueError: If the channel count does not match the given ratios.
        """
        b, num_compressed, h, w = compressed_encodings_batch.shape
        num_per_channel_encodings = int(self.block_size) ** 2
        # Same rounding as ``compress``; using a different rule here would
        # misalign the three planes whenever the product is not an integer.
        num_luminance_encodings = self._num_kept_coefficients(
            luminance_compression_ratio
        )
        num_chrominance_encodings = self._num_kept_coefficients(
            chrominance_compression_ratio
        )
        expected = num_luminance_encodings + 2 * num_chrominance_encodings
        if num_compressed != expected:
            raise ValueError(
                f"Expected {expected} compressed channels for the given "
                f"compression ratios, got {num_compressed}"
            )

        decompressed_dct_encodings = torch.zeros(
            b,
            3 * num_per_channel_encodings,
            h,
            w,
            dtype=compressed_encodings_batch.dtype,
            device=compressed_encodings_batch.device,
        )
        # (offset in the compressed tensor, number of kept channels) per plane,
        # in Y, Cb, Cr order.
        layout = (
            (0, num_luminance_encodings),
            (num_luminance_encodings, num_chrominance_encodings),
            (
                num_luminance_encodings + num_chrominance_encodings,
                num_chrominance_encodings,
            ),
        )
        for plane, (source_start, kept) in enumerate(layout):
            target_start = plane * num_per_channel_encodings
            decompressed_dct_encodings[:, target_start : target_start + kept, :, :] = (
                compressed_encodings_batch[:, source_start : source_start + kept, :, :]
            )
        return decompressed_dct_encodings

dct_autoencoder/utils.py ADDED Viewed

@@ -0,0 +1,46 @@
+import torch
def ycbcr_to_rgb(image: torch.Tensor) -> torch.Tensor:
    """Map a YCbCr image back to the RGB color space.

    Args:
        image (torch.Tensor): YCbCr image of shape (*, 3, height, width) with
            values expected in the range [0, 1].

    Returns:
        torch.Tensor: The RGB image, same shape as the input, clamped to
            [0, 1].
    """
    luma, chroma_blue, chroma_red = (image[..., idx, :, :] for idx in range(3))

    # The chroma planes carry a +0.5 offset; re-centre them around zero.
    chroma_blue = chroma_blue - 0.5
    chroma_red = chroma_red - 0.5

    red = luma + 1.403 * chroma_red
    green = luma - 0.714 * chroma_red - 0.344 * chroma_blue
    blue = luma + 1.773 * chroma_blue

    rgb = torch.stack((red, green, blue), dim=-3)
    return torch.clamp(rgb, 0, 1)
def rgb_to_ycbcr(image: torch.Tensor) -> torch.Tensor:
    """Converts an image from RGB to YCbCr color space.

    Args:
        image (torch.Tensor): The input image. The image should have shape
            (*, 3, height, width). Image values should be in the range [0, 1].

    Returns:
        torch.Tensor: The output image in YCbCr color space, with the same
            shape as the input.
    """
    r = image[..., 0, :, :]
    g = image[..., 1, :, :]
    b = image[..., 2, :, :]
    # Offset that shifts the signed chroma differences into the [0, 1] range.
    delta: float = 0.5
    y = 0.299 * r + 0.587 * g + 0.114 * b
    cb = (b - y) * 0.564 + delta
    cr = (r - y) * 0.713 + delta
    return torch.stack([y, cb, cr], -3)

dct_autoencoder/visualization.py ADDED Viewed

@@ -0,0 +1,52 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from .basis import DCTBasis
def visualize_dct_basis_functions(
    dct_constants: DCTBasis,
    figsize: int = 8,
    fig_facecolor: str = "#fb6a2c",
    title_color: str = "k",
    title_fontsize: int = 20,
    cmap: str = "gray",
) -> tuple:
    """Draw every DCT basis function as one tiled image.

    Args:
        dct_constants (DCTBasis): The DCT basis constants.
        figsize (int, optional): Side length of the square figure.
            Defaults to 8.
        fig_facecolor (str, optional): Figure facecolor, also used for the
            lines separating the tiles. Defaults to "#fb6a2c".
        title_color (str, optional): The title color. Defaults to "k".
        title_fontsize (int, optional): The title fontsize. Defaults to 20.
        cmap (str, optional): The colormap. Defaults to "gray".

    Returns:
        tuple: The figure and axis.
    """
    block_size = dct_constants.block_size
    mosaic_side = block_size * block_size

    # Tile (v, u) lands at rows v*block_size.. and columns u*block_size..:
    # bring the axes into (v, row, u, col) order and merge them pairwise.
    # The float64 cast matches the buffer the tiles are copied into otherwise.
    basis_functions_image = (
        dct_constants.basis_functions.transpose(0, 2, 1, 3)
        .reshape(mosaic_side, mosaic_side)
        .astype(np.float64)
    )

    plt.figure(figsize=(figsize, figsize), facecolor=fig_facecolor)
    plt.title(
        f"DCT Basis functions (block size: {block_size}x{block_size})",
        color=title_color,
        fontsize=title_fontsize,
        fontweight="bold",
    )
    plt.imshow(basis_functions_image, cmap=cmap)
    plt.axis("off")

    # Separator lines sit on the pixel edge at the start of each tile.
    for boundary in (tile * block_size - 0.5 for tile in range(block_size)):
        plt.axhline(boundary, color=fig_facecolor)
        plt.axvline(boundary, color=fig_facecolor)

    plt.tight_layout()
    return plt.gcf(), plt.gca()

dct_autoencoder-0.1.0.dist-info/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 dariush-bahrami
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

dct_autoencoder-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,34 @@
+Metadata-Version: 2.1
+Name: dct-autoencoder
+Version: 0.1.0
+Summary:
+Author: Dariush Bahrami
+Author-email: dariushbahrami1993@gmail.com
+Requires-Python: >=3.10,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
+Requires-Dist: numpy (>=2.1.1,<3.0.0)
+Requires-Dist: torch (>=2.4.1,<3.0.0)
+Description-Content-Type: text/markdown
+# dct-autoencoder
+2D Discrete Cosine Transform in PyTorch
+![DCT Basis Functions](./assets/figures/dct_basis_functions_block_size_16.png)
+## Usage
+Refer to the [usage notebook](./usage.ipynb) for code examples.
+## TODO
+- [x] Add support for color images
+- [x] Improve documentation
+- [ ] Add tests
+- [ ] Distribute on PyPI

dct_autoencoder-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+dct_autoencoder/__init__.py,sha256=HPbvVdAiG_hWQ2ZNruAJnr8MOWJnMMQLjjwob-verAY,76
+dct_autoencoder/basis.py,sha256=ynx3Plbts6snn7lcyfPUOCfFxUgcXMa3iYPWca7KCdI,2060
+dct_autoencoder/core.py,sha256=ikVN_yeuSNN4NXbkU3kH_m32EIrkOn1z--tij-nCJIk,9965
+dct_autoencoder/utils.py,sha256=oCFZgLAFFgV7dJoFXC3xfbtjM-fnEBpY1eSPywHIj4U,1306
+dct_autoencoder/visualization.py,sha256=Mj9Ipz2oHTfzyq8hqM1UjoRxQ2ALAtoJeRAB7GZpXCs,1818
+dct_autoencoder-0.1.0.dist-info/LICENSE,sha256=kVBYE8Z59CVgIBn5bMZF2ihgBM-2fyEDqU93DArFnQU,1072
+dct_autoencoder-0.1.0.dist-info/METADATA,sha256=4XSc-J373HyOfe4FUk51qQy44ITa6uwN_Ylclv4P70g,856
+dct_autoencoder-0.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+dct_autoencoder-0.1.0.dist-info/RECORD,,

dct_autoencoder-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: poetry-core 1.9.0
+Root-Is-Purelib: true
+Tag: py3-none-any