PyPI - stacked-linear - Versions diffs - 0.1.0__py3-none-any.whl - Mend

stacked-linear 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

stacked_linear/__init__.py +4 -0
stacked_linear/linear_layer.py +55 -0
stacked_linear/stacked_linear_layer.py +224 -0
stacked_linear-0.1.0.dist-info/METADATA +118 -0
stacked_linear-0.1.0.dist-info/RECORD +7 -0
stacked_linear-0.1.0.dist-info/WHEEL +4 -0
stacked_linear-0.1.0.dist-info/licenses/LICENSE +29 -0

stacked_linear/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .linear_layer import LinearLayer
+from .stacked_linear_layer import StackedLinearLayer
+__all__ = ["LinearLayer", "StackedLinearLayer"]

stacked_linear/linear_layer.py ADDED Viewed

@@ -0,0 +1,55 @@
+from __future__ import annotations
+import torch
+from torch import nn
+from torch.nn import functional as F
+class LinearLayer(nn.Linear):
+    """``nn.Linear`` variant that can evaluate only selected output features.
+    With no subset given it is indistinguishable from ``nn.Linear``. With a
+    subset, only the matching rows of the weight (and entries of the bias)
+    take part in the forward pass.
+    """
+    def forward(self, x: torch.Tensor, output_subset: torch.Tensor | None = None) -> torch.Tensor:
+        """Apply the layer, optionally restricted to some output features.
+        Parameters
+        ----------
+        x
+            Input tensor with shape (..., in_features).
+        output_subset
+            1-D tensor indexing the output features to compute. If None, every
+            output feature is computed.
+        Returns
+        -------
+        torch.Tensor
+            Output tensor with shape (..., out_features) or (..., len(output_subset)).
+        Raises
+        ------
+        NotImplementedError
+            If ``output_subset`` is given but is not 1-dimensional.
+        Examples
+        --------
+        >>> import torch
+        >>> layer = LinearLayer(10, 5)
+        >>> x = torch.randn(2, 10)
+        >>> # Standard forward pass
+        >>> out = layer(x)
+        >>> out.shape
+        torch.Size([2, 5])
+        >>> # Subset forward pass
+        >>> subset = torch.tensor([0, 2])
+        >>> out_subset = layer(x, output_subset=subset)
+        >>> out_subset.shape
+        torch.Size([2, 2])
+        """
+        # No subset requested: defer entirely to nn.Linear.
+        if output_subset is None:
+            return super().forward(x)  # (..., i) -> (..., o)
+        # Only a flat index vector is supported.
+        if output_subset.dim() != 1:
+            raise NotImplementedError()
+        # Pick the requested rows of the (o, i) weight and entries of the (o,) bias.
+        selected_bias = None if self.bias is None else self.bias[output_subset]  # (o_subset)
+        selected_weight = self.weight[output_subset]  # (o_subset, i)
+        return F.linear(x, selected_weight, selected_bias)  # (..., i) -> (..., o_subset)

stacked_linear/stacked_linear_layer.py ADDED Viewed

@@ -0,0 +1,224 @@
+from __future__ import annotations
+import math
+from typing import TYPE_CHECKING
+import torch
+from torch import nn
+if TYPE_CHECKING:
+    from typing import Any
+class StackedLinearLayer(nn.Module):
+    """A parallel stacked linear layer that applies multiple linear transformations in parallel.
+    This layer applies an independent linear transformation to each stack/split
+    of the input. It's particularly useful in additive decoders where
+    different splits should be calculated in parallel.
+    Parameters
+    ----------
+    n_stacks
+        Number of stacks/splits to process in parallel.
+    in_features
+        Number of input features per stack.
+    out_features
+        Number of output features per stack.
+    bias
+        Whether to include bias terms for each stack.
+    device
+        Device to place the layer on.
+    dtype
+        Data type for the layer parameters.
+    Notes
+    -----
+    The layer maintains separate weight and bias parameters for each stack:
+    - Weight shape: (n_stacks, in_features, out_features)
+    - Bias shape: (n_stacks, out_features) if bias=True, None otherwise
+    The forward pass applies the transformation to each stack independently:
+    output[b, s, o] = sum_i(x[b, s, i] * weight[s, i, o]) + bias[s, o]
+    This is equivalent to applying n_stacks separate linear layers in parallel,
+    which is more efficient than using separate nn.Linear layers.
+    Examples
+    --------
+    >>> import torch
+    >>> # Create a stacked linear layer with 4 stacks
+    >>> layer = StackedLinearLayer(n_stacks=4, in_features=64, out_features=128)
+    >>> # Input shape: (batch_size, n_stacks, in_features)
+    >>> x = torch.randn(32, 4, 64)
+    >>> # Forward pass
+    >>> output = layer(x)
+    >>> output.shape
+    torch.Size([32, 4, 128])
+    >>> # Each stack has its own parameters
+    >>> layer.weight.shape
+    torch.Size([4, 64, 128])
+    >>> layer.bias.shape
+    torch.Size([4, 128])
+    """
+    __constants__ = ["n_stacks", "in_features", "out_features"]
+    n_stacks: int
+    in_features: int
+    out_features: int
+    weight: torch.Tensor
+    bias: torch.Tensor | None
+    def __init__(
+        self,
+        n_stacks: int,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device: Any = None,
+        dtype: Any = None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.n_stacks = n_stacks
+        self.in_features = in_features
+        self.out_features = out_features
+        self.weight = nn.Parameter(torch.empty((n_stacks, in_features, out_features), **factory_kwargs))
+        if bias:
+            self.bias = nn.Parameter(torch.empty(n_stacks, out_features, **factory_kwargs))
+        else:
+            # Register the name so that ``self.bias`` exists and is None.
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+    def reset_parameters(self) -> None:
+        """Reset the layer parameters to their initial values.
+        Notes
+        -----
+        Weights and biases (when present) are both redrawn from a uniform
+        distribution in [-1/sqrt(in_features), 1/sqrt(in_features)], the same
+        range the default nn.Linear initialization uses.
+        """
+        self._init_weight()
+        self._init_bias()
+    def _init_bound(self) -> float:
+        """Return the half-width 1/sqrt(in_features) of the uniform init range."""
+        # Same as default nn.Linear (https://github.com/pytorch/pytorch/issues/57109)
+        return 1 / math.sqrt(self.in_features)
+    def _init_weight(self) -> None:
+        """Initialize the weights in place from U(-1/sqrt(in_features), 1/sqrt(in_features))."""
+        bound = self._init_bound()
+        nn.init.uniform_(self.weight, -bound, bound)
+    def _init_bias(self) -> None:
+        """Initialize the bias in place from U(-1/sqrt(in_features), 1/sqrt(in_features)).
+        Does nothing when the layer was created with ``bias=False``.
+        """
+        if self.bias is not None:
+            bound = self._init_bound()
+            nn.init.uniform_(self.bias, -bound, bound)
+    def forward(
+        self,
+        x: torch.Tensor,
+        output_subset: torch.Tensor | None = None,
+        stack_subset: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        r"""Forward pass through the stacked linear layer.
+        Parameters
+        ----------
+        x
+            Input tensor with shape (batch_size, n_stacks, in_features), or
+            (batch_size, n_stacks_subset, in_features) when ``stack_subset``
+            is given. Without ``stack_subset`` the input must be 3-dimensional
+            because the product is computed with ``torch.bmm``.
+        output_subset
+            1-D tensor indexing the output features to compute. If None, all
+            output features are computed.
+        stack_subset
+            Indices of the stacks whose parameters are applied to ``x``, e.g.
+            with shape (batch_size, n_stacks_subset) to select different stacks
+            per batch item. If None, stack ``s`` is applied to ``x[:, s]``.
+        Returns
+        -------
+        torch.Tensor
+            Output tensor with shape (batch_size, S, O), where S is n_stacks
+            (or n_stacks_subset when ``stack_subset`` is given) and O is
+            out_features (or len(output_subset) when ``output_subset`` is given).
+        Raises
+        ------
+        NotImplementedError
+            If ``output_subset`` is given but is not 1-dimensional.
+        Notes
+        -----
+        The forward pass applies the linear transformation to each stack:
+        .. math::
+            \text{output}[b, s, o] = \sum_{i} \text{input}[b, s, i] \cdot \text{weight}[s, i, o] + \text{bias}[s, o]
+        where:
+        - b: batch index
+        - s: stack index
+        - i: input feature index
+        - o: output feature index
+        The computation uses torch.bmm when all stacks are used and a
+        broadcasted torch.matmul when ``stack_subset`` is given.
+        Examples
+        --------
+        >>> import torch
+        >>> # Create layer
+        >>> layer = StackedLinearLayer(n_stacks=3, in_features=10, out_features=5)
+        >>> # Input: batch_size=2, n_stacks=3, in_features=10
+        >>> x = torch.randn(2, 3, 10)
+        >>> # Forward pass
+        >>> output = layer(x)
+        >>> output.shape
+        torch.Size([2, 3, 5])
+        """
+        if output_subset is not None and output_subset.dim() != 1:
+            raise NotImplementedError(
+                f"output_subset must be a 1-D tensor, got {output_subset.dim()} dimensions"
+            )
+        # Restrict the outputs first so that everything below works on the
+        # smallest possible parameters: weight (s, i, o_subset), bias (s, o_subset).
+        weight = self.weight
+        bias = self.bias
+        if output_subset is not None:
+            weight = weight[:, :, output_subset]
+            bias = bias[:, output_subset] if bias is not None else None
+        if stack_subset is None:
+            # x: (b, s, i) -> output: (b, s, o_subset)
+            # bmm batches over the leading dim, so move stacks first and back afterwards.
+            # slower: mm = torch.einsum("bsi,sio->bso", x, weight)
+            mm = torch.bmm(x.transpose(0, 1), weight).transpose(0, 1)  # (b, s, o_subset)
+        else:
+            # stack_subset: (b, s_subset); x: (b, s_subset, i)
+            weight = weight[stack_subset]  # (b, s_subset, i, o_subset)
+            bias = bias[stack_subset] if bias is not None else None  # (b, s_subset, o_subset)
+            # Treat each feature vector as a 1 x i matrix, multiply, then drop the unit dim.
+            mm = torch.matmul(x.unsqueeze(2), weight).squeeze(2)  # (b, s_subset, o_subset)
+        if bias is not None:
+            mm = mm + bias  # (s, o) broadcasts over the batch; (b, s_subset, o) matches exactly
+        return mm
+    def extra_repr(self) -> str:
+        """String representation for printing the layer."""
+        return (
+            f"in_features={self.in_features}, out_features={self.out_features}, "
+            f"n_stacks={self.n_stacks}, bias={self.bias is not None}"
+        )

stacked_linear-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,118 @@
+Metadata-Version: 2.4
+Name: stacked-linear
+Version: 0.1.0
+Summary: Efficient implementation of stacked linear modules
+Project-URL: Documentation, https://stacked-linear.readthedocs.io/
+Project-URL: Homepage, https://github.com/moinfar/stacked-linear
+Project-URL: Source, https://github.com/moinfar/stacked-linear
+Author: Amir Ali Moinfar
+Maintainer-email: Amir Ali Moinfar <moinfar.amirali@gmail.com>
+License: BSD 3-Clause License
+        Copyright (c) 2026, Amir Ali Moinfar
+        All rights reserved.
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions are met:
+        1. Redistributions of source code must retain the above copyright notice, this
+           list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright notice,
+           this list of conditions and the following disclaimer in the documentation
+           and/or other materials provided with the distribution.
+        3. Neither the name of the copyright holder nor the names of its
+           contributors may be used to endorse or promote products derived from
+           this software without specific prior written permission.
+        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+        AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+        IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+        DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+        FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+        SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+        CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+        OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+        OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+License-File: LICENSE
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.11
+Requires-Dist: torch>=2
+Description-Content-Type: text/markdown
+# Parallel Stacked Linear Modules for PyTorch
+[![Tests][badge-tests]][tests]
+[![Documentation][badge-docs]][documentation]
+Efficient implementation of stacked linear modules in PyTorch, with support for output and stack subsetting.
+## Features
+- **`StackedLinearLayer`**: A parallelized linear layer that applies multiple independent transformations across different input stacks simultaneously. This is significantly more efficient than a Python `for` loop over multiple `nn.Linear` layers, and is useful for specialized neural architectures such as additive decoders.
+- **Subsetting Support**: Both layers allow for subsetting output features during the forward pass, and `StackedLinearLayer` additionally supports subsetting stacks.
+## Installation
+```bash
+pip install stacked-linear
+```
+Or install from source:
+```bash
+pip install git+https://github.com/moinfar/stacked-linear.git
+```
+## Quick Start
+### Linear Layer with Output Subsetting
+```python
+import torch
+from stacked_linear import LinearLayer
+# Initialize a layer (10 inputs, 5 outputs)
+layer = LinearLayer(10, 5)
+x = torch.randn(2, 10)
+# Forward pass on a subset of output features (indices 0, 2, and 4)
+subset = torch.tensor([0, 2, 4])
+output = layer(x, output_subset=subset)  # Shape: (2, 3)
+```
+### Stacked Linear Layer
+```python
+import torch
+from stacked_linear import StackedLinearLayer
+# 3 parallel stacks, each mapping 10 inputs to 5 outputs
+layer = StackedLinearLayer(n_stacks=3, in_features=10, out_features=5)
+x = torch.randn(2, 3, 10)  # (batch, stacks, features)
+# Efficient parallel forward pass
+output = layer(x)  # Shape: (2, 3, 5)
+# Forward pass on a subset of output features across all stacks
+subset = torch.tensor([1, 3])
+output_subset = layer(x, output_subset=subset)  # Shape: (2, 3, 2)
+# Forward pass on a subset of stacks
+stack_subset = torch.tensor([[0, 2], [1, 2]]) # Indices for each batch item
+x_subset = torch.randn(2, 2, 10)
+output_stack_subset = layer(x_subset, stack_subset=stack_subset) # Shape: (2, 2, 5)
+```
+[badge-tests]: https://img.shields.io/github/actions/workflow/status/moinfar/stacked-linear/test.yaml?branch=main
+[badge-docs]: https://img.shields.io/readthedocs/stacked-linear
+[tests]: https://github.com/moinfar/stacked-linear/actions/workflows/test.yaml
+[documentation]: https://stacked-linear.readthedocs.io
+[issue tracker]: https://github.com/moinfar/stacked-linear/issues

stacked_linear-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+stacked_linear/__init__.py,sha256=ArUfrXt67Z0KtW-09mVssPxewwYMm5jBeNekCSi28u8,140
+stacked_linear/linear_layer.py,sha256=VGiUYAxjWG2lKdWZOb2MDV_jey6YHHOkGACmLSU9fC8,1821
+stacked_linear/stacked_linear_layer.py,sha256=10TtOcRZWNrAINs5IjKO99cvbopQ9qBKJ3RHSEaKfnI,8210
+stacked_linear-0.1.0.dist-info/METADATA,sha256=ZFEzGuck2GFuv_xmcd-bNdPKcf0WzbsANGG_E_PEw8E,4921
+stacked_linear-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+stacked_linear-0.1.0.dist-info/licenses/LICENSE,sha256=wm36XbiogTgMEVdMfi7uGmVSINOYbPiSE1RIOhhII7U,1524
+stacked_linear-0.1.0.dist-info/RECORD,,

stacked_linear-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

stacked_linear-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,29 @@
+BSD 3-Clause License
+Copyright (c) 2026, Amir Ali Moinfar
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.