hyper-connections 0.0.21__tar.gz → 0.0.23__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/PKG-INFO +7 -6
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/README.md +5 -5
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper_connections/__init__.py +2 -0
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper_connections/hyper_connections.py +29 -22
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper_connections/hyper_connections_with_multi_branch_inputs.py +26 -20
- hyper_connections-0.0.23/hyper_connections/hyper_connections_with_multi_input_streams.py +338 -0
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/pyproject.toml +2 -1
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/.github/workflows/python-publish.yml +0 -0
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/.gitignore +0 -0
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/LICENSE +0 -0
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper-connections.png +0 -0
{hyper_connections-0.0.21 → hyper_connections-0.0.23}/PKG-INFO RENAMED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hyper-connections
-Version: 0.0.21
+Version: 0.0.23
 Summary: Hyper-Connections
 Project-URL: Homepage, https://pypi.org/project/hyper-connections/
 Project-URL: Repository, https://github.com/lucidrains/hyper-connections
@@ -34,6 +34,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
+Requires-Dist: beartype
 Requires-Dist: einops>=0.8.0
 Requires-Dist: torch>=2.3
 Provides-Extra: examples
````
````diff
@@ -71,9 +72,9 @@ residual = branch(residual) + residual
 
 # after, say 4 streams in paper
 
-from hyper_connections import HyperConnections
+from hyper_connections import get_init_and_expand_reduce_stream_functions
 
-init_hyper_conn, expand_stream, reduce_stream = HyperConnections.get_init_and_expand_reduce_stream_functions(4)
+init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)
 
 # 1. wrap your branch function
 
@@ -110,9 +111,9 @@ residual = branch(residual) + residual
 
 # after, say 4 streams in paper
 
-from hyper_connections import HyperConnections
+from hyper_connections import get_init_and_expand_reduce_stream_functions
 
-init_hyper_conn, expand_stream, reduce_stream = HyperConnections.get_init_and_expand_reduce_stream_functions(4)
+init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)
 
 # 1. instantiate hyper connection with correct number of streams (4 in this case) - or use the init function above
 
@@ -140,7 +141,7 @@ residual = reduce_stream(residual)
 To compare hyper connections to plain residual without changing the code, just pass `disable = True` when fetching the functions
 
 ```python
-HyperConnections.get_init_and_expand_reduce_stream_functions(4, disable = True)
+get_init_and_expand_reduce_stream_functions(4, disable = True)
 ```
 
 ## Citation
````
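The last hunk above captures the API contract for `disable = True`: the helper hands back the plain `Residual` class plus identity expand/reduce functions, so the surrounding training code runs unchanged as an ordinary residual network. A minimal sketch of that fallback (assuming the package exports the helper at top level, as the updated README import suggests):

```python
from hyper_connections import get_init_and_expand_reduce_stream_functions

init_residual, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4, disable = True)

# with disable = True both stream helpers are the same module-level identity
# function, and init_residual builds the plain Residual wrapper
assert expand_stream is reduce_stream
```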
{hyper_connections-0.0.21 → hyper_connections-0.0.23}/README.md RENAMED

````diff
@@ -28,9 +28,9 @@ residual = branch(residual) + residual
 
 # after, say 4 streams in paper
 
-from hyper_connections import HyperConnections
+from hyper_connections import get_init_and_expand_reduce_stream_functions
 
-init_hyper_conn, expand_stream, reduce_stream = HyperConnections.get_init_and_expand_reduce_stream_functions(4)
+init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)
 
 # 1. wrap your branch function
 
@@ -67,9 +67,9 @@ residual = branch(residual) + residual
 
 # after, say 4 streams in paper
 
-from hyper_connections import HyperConnections
+from hyper_connections import get_init_and_expand_reduce_stream_functions
 
-init_hyper_conn, expand_stream, reduce_stream = HyperConnections.get_init_and_expand_reduce_stream_functions(4)
+init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)
 
 # 1. instantiate hyper connection with correct number of streams (4 in this case) - or use the init function above
 
@@ -97,7 +97,7 @@ residual = reduce_stream(residual)
 To compare hyper connections to plain residual without changing the code, just pass `disable = True` when fetching the functions
 
 ```python
-HyperConnections.get_init_and_expand_reduce_stream_functions(4, disable = True)
+get_init_and_expand_reduce_stream_functions(4, disable = True)
 ```
 
 ## Citation
````
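The README hunks mirror the PKG-INFO ones above: every documented call site moves from the old classmethods on `HyperConnections` to the new module-level `get_init_and_expand_reduce_stream_functions`. A minimal end-to-end sketch of the updated API (the `nn.Linear` branch, batch size, and `dim = 512` are illustrative, not taken from the diff):

```python
import torch
from torch import nn
from hyper_connections import get_init_and_expand_reduce_stream_functions

# helpers for 4 residual streams, as in the paper
init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)

branch = nn.Linear(512, 512)  # stand-in for an attention or feedforward block
hyper_conn = init_hyper_conn(dim = 512, branch = branch)

x = torch.randn(2, 16, 512)

x = expand_stream(x)   # (2, 16, 512) -> (8, 16, 512): one copy per stream, folded into batch
x = hyper_conn(x)      # width connection -> branch -> depth connection
x = reduce_stream(x)   # (8, 16, 512) -> (2, 16, 512): streams summed back out
```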
{hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper_connections/hyper_connections.py RENAMED
````diff
@@ -12,6 +12,8 @@ from torch.utils._pytree import tree_flatten, tree_unflatten
 
 from einops import rearrange, repeat, reduce, einsum
 
+from beartype import beartype
+
 """
 ein notation:
 b - batch
````
````diff
@@ -31,6 +33,27 @@ def default(v, d):
 def identity(t):
     return t
 
+# main functions
+
+def get_expand_reduce_stream_functions(num_streams, disable = False):
+
+    if disable:
+        return (identity, identity)
+
+    expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
+    reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)
+
+    return expand_fn, reduce_fn
+
+def get_init_and_expand_reduce_stream_functions(num_streams, disable = False):
+
+    hyper_conn_klass = HyperConnections if not disable else Residual
+
+    init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
+    expand_reduce_fns = get_expand_reduce_stream_functions(num_streams, disable = disable)
+
+    return (init_hyper_conn_fn, *expand_reduce_fns)
+
 # norms
 
 class RMSNorm(Module):
````
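The two hoisted helpers are thin `functools.partial` wrappers over einops, which makes the stream bookkeeping easy to check in isolation. A quick sketch of what the expand/reduce pair does to shapes (tensor sizes are illustrative):

```python
import torch
from functools import partial
from einops import repeat, reduce

num_streams = 4
expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)

x = torch.randn(2, 16, 512)
streams = expand_fn(x)       # (8, 16, 512): four identical copies per batch element
summed = reduce_fn(streams)  # (2, 16, 512): the four streams summed back together
assert torch.allclose(summed, 4 * x, atol = 1e-5)
```

Moving these out of the class also lets `get_init_and_expand_reduce_stream_functions` hand back either `HyperConnections` or `Residual` without the caller caring which was picked.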
````diff
@@ -47,10 +70,11 @@ class RMSNorm(Module):
 # residual base class
 
 class Residual(Module):
+    @beartype
     def __init__(
         self,
         *args,
-        branch = None,
+        branch: Module | None = None,
         **kwargs
     ):
         super().__init__()
````
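The new `beartype` dependency is what turns the `branch: Module | None` annotation from documentation into a call-time check. A standalone sketch of the failure mode it catches (assumes Python 3.10+ for the `|` union syntax; the class here is a stripped-down stand-in, not the library's `Residual`):

```python
from beartype import beartype
from beartype.roar import BeartypeCallHintParamViolation
from torch.nn import Module, Identity

class Residual:
    @beartype
    def __init__(self, *args, branch: Module | None = None, **kwargs):
        self.branch = branch

Residual(branch = Identity())  # ok: an nn.Module
Residual(branch = None)        # ok: None is allowed by the annotation

try:
    Residual(branch = lambda t: t)  # a bare callable is not a Module
except BeartypeCallHintParamViolation as err:
    print('rejected:', err)
```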
````diff
@@ -97,6 +121,7 @@ class Residual(Module):
 # hyper connection residual streams
 
 class HyperConnections(Module):
+    @beartype
     def __init__(
         self,
         num_residual_streams,
````
````diff
@@ -146,27 +171,6 @@ class HyperConnections(Module):
 
         self.channel_first = channel_first
 
-    @classmethod
-    def get_expand_reduce_stream_functions(cls, num_streams, disable = False):
-
-        if disable:
-            return (identity, identity)
-
-        expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
-        reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)
-
-        return expand_fn, reduce_fn
-
-    @classmethod
-    def get_init_and_expand_reduce_stream_functions(cls, num_streams, disable = False):
-
-        hyper_conn_klass = cls if not disable else Residual
-
-        init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
-        expand_reduce_fns = cls.get_expand_reduce_stream_functions(num_streams, disable = disable)
-
-        return (init_hyper_conn_fn, *expand_reduce_fns)
-
     def width_connection(self, residuals):
         # width connection
 
````
````diff
@@ -244,6 +248,9 @@
 
         return add_residual_fn(branch_output)
 
+HyperConnections.get_expand_reduce_stream_functions = staticmethod(get_expand_reduce_stream_functions)
+HyperConnections.get_init_and_expand_reduce_stream_functions = staticmethod(get_init_and_expand_reduce_stream_functions)
+
 # stream embed
 
 class StreamEmbed(Module):
````
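Reattaching the two functions as `staticmethod`s preserves the 0.0.21 call style alongside the new one. The `hyper_connections/__init__.py` change (+2 lines) is not shown in this diff body, so the exact new top-level exports are an assumption here, though the README hunks above import the function from the package root:

```python
from hyper_connections import HyperConnections, get_init_and_expand_reduce_stream_functions

# old (0.0.21) classmethod-style call still resolves, via the staticmethod assignment
old_style = HyperConnections.get_init_and_expand_reduce_stream_functions(4)

# new (0.0.23) module-level call
new_style = get_init_and_expand_reduce_stream_functions(4)
```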
{hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper_connections/hyper_connections_with_multi_branch_inputs.py RENAMED

````diff
@@ -12,6 +12,8 @@ from torch.utils._pytree import tree_flatten, tree_unflatten
 
 from einops import rearrange, repeat, reduce, einsum
 
+from beartype import beartype
+
 """
 ein notation:
 b - batch
@@ -38,11 +40,32 @@ def divisible_by(num, den):
 def identity(t):
     return t
 
+# main functions
+
+def get_expand_reduce_stream_functions(cls, num_streams, disable = False):
+    if disable:
+        return (identity, identity)
+
+    expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
+    reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)
+
+    return expand_fn, reduce_fn
+
+def get_init_and_expand_reduce_stream_functions(cls, num_streams, disable = False):
+
+    hyper_conn_klass = HyperConnections if not disable else Residual
+
+    init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
+    expand_reduce_fns = get_expand_reduce_stream_functions(num_streams, disable = disable)
+
+    return (init_hyper_conn_fn, *expand_reduce_fns)
+
 # main classes
 
 # hyper connection residual streams
 
 class HyperConnections(Module):
+    @beartype
     def __init__(
         self,
         num_residual_streams,
@@ -108,26 +131,6 @@ class HyperConnections(Module):
 
         self.channel_first = channel_first
 
-    @classmethod
-    def get_expand_reduce_stream_functions(cls, num_streams, disable = False):
-        if disable:
-            return (identity, identity)
-
-        expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
-        reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)
-
-        return expand_fn, reduce_fn
-
-    @classmethod
-    def get_init_and_expand_reduce_stream_functions(cls, num_streams, disable = False):
-
-        hyper_conn_klass = cls if not disable else Residual
-
-        init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
-        expand_reduce_fns = cls.get_expand_reduce_stream_functions(num_streams, disable = disable)
-
-        return (init_hyper_conn_fn, *expand_reduce_fns)
-
     def width_connection(self, residuals):
         num_streams, num_branch_inputs = self.num_residual_streams, self.num_branch_inputs
 
@@ -225,3 +228,6 @@ class HyperConnections(Module):
         branch_output = torch.cat(branch_outputs)
 
         return add_residual_fn(branch_output)
+
+HyperConnections.get_expand_reduce_stream_functions = staticmethod(get_expand_reduce_stream_functions)
+HyperConnections.get_init_and_expand_reduce_stream_functions = staticmethod(get_init_and_expand_reduce_stream_functions)
````
hyper_connections-0.0.23/hyper_connections/hyper_connections_with_multi_input_streams.py ADDED

```python
from __future__ import annotations
from typing import Callable

from functools import partial
from random import randrange

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Module, ModuleList
from torch.utils._pytree import tree_flatten, tree_unflatten

from einops import rearrange, repeat, reduce, einsum
from einops.layers.torch import Rearrange

from beartype import beartype

"""
ein notation:
b - batch
d - feature dimension
s - residual streams
t - residual streams + num branch inputs
"""

# helper functions

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

def identity(t):
    return t

# main functions

def get_expand_reduce_stream_functions(num_streams, disable = False):

    if disable:
        return (identity, identity)

    expand_fn = partial(repeat, pattern = 'b ... -> (b s) ...', s = num_streams)
    reduce_fn = partial(reduce, pattern = '(b s) ... -> b ...', reduction = 'sum', s = num_streams)

    return expand_fn, reduce_fn

def get_init_and_expand_reduce_stream_functions(num_streams, disable = False):

    hyper_conn_klass = HyperConnections if not disable else Residual

    init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
    expand_reduce_fns = get_expand_reduce_stream_functions(num_streams, disable = disable)

    return (init_hyper_conn_fn, *expand_reduce_fns)

# norms

class RMSNorm(Module):
    def __init__(self, dim):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        return F.normalize(x, dim = -1) * self.scale * (self.gamma + 1)

class ProjActScale(Module):
    def __init__(
        self,
        dim,
        dim_out,
        activation: Module = nn.Identity(),
        scale_init: float = 1e-2,
        squeeze_output = False
    ):
        super().__init__()
        dim_out = default(dim_out, dim)

        self.proj = nn.Linear(dim, dim_out, bias = False)
        nn.init.zeros_(self.proj.weight)

        self.act = activation
        self.scale = nn.Parameter(torch.ones(()) * scale_init)
        self.maybe_squeeze = Rearrange('... 1 -> ...') if squeeze_output else nn.Identity()

    def forward(self, x):
        out = self.proj(x)
        out = self.act(out)
        return self.maybe_squeeze(out * self.scale)

# main classes

# residual base class

class Residual(Module):
    @beartype
    def __init__(
        self,
        *args,
        branch: Module | None = None,
        **kwargs
    ):
        super().__init__()
        self.branch = branch

    def width_connection(self, residuals, *args, **kwargs):
        return residuals, residuals, dict()

    def depth_connection(self, branch_output, residuals):
        return branch_output + residuals

    def decorate_branch(self, branch: Callable):
        assert not exists(self.branch), 'branch was already wrapped on init'

        def forward_and_add_residual(residual, *args, **kwargs):
            branch_input, add_residual = self.forward(residual, *args, **kwargs)

            branch_output = branch(branch_input, *args, **kwargs)

            residual = add_residual(branch_output)

            return residual

        return forward_and_add_residual

    def forward(self, residuals, *branch_args, **branch_kwargs):

        branch_input, residuals, residual_kwargs = self.width_connection(residuals, *branch_args, **branch_kwargs)

        def add_residual_fn(branch_out):
            (branch_out, *rest), tree_spec = tree_flatten(branch_out)

            branch_out = self.depth_connection(branch_out, residuals, **residual_kwargs)

            return tree_unflatten((branch_out, *rest), tree_spec)

        if not exists(self.branch):
            return branch_input, add_residual_fn

        branch_output = self.branch(branch_input, *branch_args, **branch_kwargs)

        return add_residual_fn(branch_output)

# hyper connection with multiple input streams

InputPathType = int | str # the path to the second residual stream, where `int` points to *args[`int` + 1] and `str` points to **kwargs[`str`]

class HyperConnections(Module):
    @beartype
    def __init__(
        self,
        num_residual_streams,
        *,
        dim,
        additional_input_paths: (
            list[InputPathType |
            tuple[InputPathType, int]] # if the second residual has different dimensions, second tuple element is the dimension
            | None
        ) = None,
        branch: Module | None = None,
        layer_index = None,
        tanh = True,
        channel_first = False,
        dropout = 0.
    ):
        """
        Appendix J, Algorithm2 in - https://arxiv.org/abs/2409.19606
        """
        super().__init__()

        self.branch = branch
        act = nn.Tanh() if tanh else nn.Identity()

        self.num_residual_streams = num_residual_streams
        assert num_residual_streams > 0, '`num_residual_streams` must be greater than 0'

        # activation, seemingly results were wishy washy depending on using tanh or not

        self.norm = RMSNorm(dim) # they used layernorm in paper, but rmsnorm is fine given what we know now

        init_residual_index = default(layer_index, randrange(num_residual_streams)) % num_residual_streams # just choose one random residual stream if layer index not given

        init_alpha0 = torch.zeros((num_residual_streams, 1))
        init_alpha0[init_residual_index, 0] = 1.

        self.dynamic_alpha_and_branch_input = ProjActScale(dim, num_residual_streams + 1)
        self.static_alpha = nn.Parameter(torch.cat([init_alpha0, torch.eye(num_residual_streams)], dim = 1))

        self.dynamic_beta = ProjActScale(dim, 1, activation = act, squeeze_output = True)
        self.static_beta = nn.Parameter(torch.ones(num_residual_streams))

        # additional input residual streams

        additional_input_paths = default(additional_input_paths, [])
        additional_input_paths = [one_path if isinstance(one_path, tuple) else (one_path, dim) for one_path in additional_input_paths]

        self.additional_norms = ModuleList([RMSNorm(dim) for _, dim in additional_input_paths])
        self.additional_to_dynamic_input = ModuleList([ProjActScale(dim, 1, activation = act, squeeze_output = True) for _, dim in additional_input_paths])
        self.additional_static_input = nn.ParameterList([nn.Parameter(init_alpha0[..., 0])])

        self.additional_input_paths = additional_input_paths

        # dropouts

        self.dropout = nn.Dropout(dropout)

        # channel first option

        self.channel_first = channel_first

    def width_connection(
        self,
        residuals,
        *branch_args,
        **branch_kwargs
    ):

        transpose = self.channel_first

        # width connection

        if transpose:
            residuals = rearrange(residuals, 'b d ... -> b ... d')

        residuals = rearrange(residuals, '(b s) ... d -> b ... s d', s = self.num_residual_streams)

        normed = self.norm(residuals)

        # alpha for weighted sum of residuals going into branch

        dynamic_alpha = self.dynamic_alpha_and_branch_input(normed)
        alpha = dynamic_alpha + self.static_alpha

        # beta for weights from branch output back to residual streams

        dynamic_beta = self.dynamic_beta(normed)
        beta = dynamic_beta + self.static_beta

        mix_h = einsum(alpha, residuals, '... s t, ... s d -> ... t d')

        branch_input, residuals = mix_h[..., 0, :], mix_h[..., 1:, :]

        if transpose:
            branch_input = rearrange(branch_input, 'b ... d -> b d ...')

        # take care of additional inputs

        for (path, *_), norm, proj, learned_static in zip(self.additional_input_paths, self.additional_norms, self.additional_to_dynamic_input, self.additional_static_input):

            # get the residual streams from additional arguments

            if isinstance(path, int):
                additional_residuals = branch_args[path]
            elif isinstance(path, str):
                additional_residuals = branch_kwargs[path]

            assert torch.is_tensor(additional_residuals)

            # handle channel first

            if transpose:
                additional_residuals = rearrange('b d ... -> b ... d')

            additional_residuals = rearrange(additional_residuals, '(b s) ... d -> b ... s d', s = self.num_residual_streams)

            # norm

            additional_mix = proj(norm(additional_residuals))
            additional_mix = additional_mix + learned_static

            additional_residuals = einsum(additional_mix, additional_residuals, '... s, ... s d -> ... d')

            # transpose out

            if transpose:
                additional_residuals = rearrange('b ... d -> b d ...')

            # set back transformed residual

            if isinstance(path, int):
                branch_args[path] = additional_residuals
            elif isinstance(path, str):
                branch_kwargs[path] = additional_residuals

        return ([branch_input, *branch_args], branch_kwargs), residuals, dict(beta = beta)

    def depth_connection(self, branch_output, residuals, *, beta):
        # 'depth' connection

        if self.channel_first:
            branch_output = rearrange(branch_output, 'b d ... -> b ... d')

        residuals = einsum(branch_output, beta, 'b ... d, b ... s -> b ... s d') + residuals
        output = rearrange(residuals, 'b ... s d -> (b s) ... d')

        if self.channel_first:
            output = rearrange(output, 'b ... d -> b d ...')

        return self.dropout(output)

    def decorate_branch(self, branch: Callable):
        assert not exists(self.branch), 'branch was already wrapped on init'

        def forward_and_add_residual(residual, *args, **kwargs):
            ([branch_input, *args], kwargs), add_residual = self.forward(residual, *args, **kwargs)

            branch_output = branch(branch_input, *args, **kwargs)

            residual = add_residual(branch_output)

            return residual

        return forward_and_add_residual

    def forward(self, residuals, *branch_args, **branch_kwargs):

        (branch_args, branch_kwargs), residuals, residual_kwargs = self.width_connection(residuals, *branch_args, **branch_kwargs)

        def add_residual_fn(branch_out):
            (branch_out, *rest), tree_spec = tree_flatten(branch_out)

            branch_out = self.depth_connection(branch_out, residuals, **residual_kwargs)

            return tree_unflatten((branch_out, *rest), tree_spec)

        if not exists(self.branch):
            return (branch_args, branch_kwargs), add_residual_fn

        branch_output = self.branch(*branch_args, **branch_kwargs)

        return add_residual_fn(branch_output)

# add static methods

HyperConnections.get_expand_reduce_stream_functions = staticmethod(get_expand_reduce_stream_functions)
HyperConnections.get_init_and_expand_reduce_stream_functions = staticmethod(get_init_and_expand_reduce_stream_functions)
```
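The new module generalizes hyper connections to branches that take more than one tensor: `additional_input_paths` names which positional (`int`) or keyword (`str`) branch arguments should themselves be treated as residual streams and mixed with learned weights before the branch runs. A hedged usage sketch (the import path follows the new file's location; `SumBranch`, the `context` kwarg, and all dimensions are illustrative, not from the diff):

```python
import torch
from torch import nn
from hyper_connections.hyper_connections_with_multi_input_streams import (
    get_init_and_expand_reduce_stream_functions,
)

class SumBranch(nn.Module):
    # hypothetical branch with a second tensor input passed by keyword
    def forward(self, x, *, context):
        return x + context

init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4)

hyper_conn = init_hyper_conn(
    dim = 512,
    branch = SumBranch(),
    additional_input_paths = ['context']  # mix the 'context' kwarg as an extra residual stream
)

x = expand_stream(torch.randn(2, 16, 512))
context = expand_stream(torch.randn(2, 16, 512))

out = reduce_stream(hyper_conn(x, context = context))  # (2, 16, 512)
```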
{hyper_connections-0.0.21 → hyper_connections-0.0.23}/pyproject.toml RENAMED

````diff
@@ -1,6 +1,6 @@
 [project]
 name = "hyper-connections"
-version = "0.0.21"
+version = "0.0.23"
 description = "Hyper-Connections"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -23,6 +23,7 @@ classifiers=[
 ]
 
 dependencies = [
+    "beartype",
     "einops>=0.8.0",
     "torch>=2.3",
 ]
````
Files without changes:

- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/.github/workflows/python-publish.yml
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/.gitignore
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/LICENSE
- {hyper_connections-0.0.21 → hyper_connections-0.0.23}/hyper-connections.png