hyper-connections 0.1.15__tar.gz → 0.2.1__tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hyper-connections
-Version: 0.1.15
+Version: 0.2.1
 Summary: Hyper-Connections
 Project-URL: Homepage, https://pypi.org/project/hyper-connections/
 Project-URL: Repository, https://github.com/lucidrains/hyper-connections
@@ -141,6 +141,12 @@ To compare hyper connections to plain residual without changing the code, just p
 get_init_and_expand_reduce_stream_functions(4, disable = True)
 ```
 
+To use the fractionated feature dimensions proposed in [a follow-up paper](https://arxiv.org/abs/2503.14125) by the same authors, just instantiate with `num_fracs` greater than `1`, like so
+
+```python
+get_init_and_expand_reduce_stream_functions(1, num_fracs = 4) # also allows you to mix streams and fractions of the feature dimension
+```
+
 ## Citation
 
 ```bibtex
@@ -160,3 +166,14 @@ get_init_and_expand_reduce_stream_functions(4, disable = True)
     url = {https://medium.com/@ohadrubin/exploring-weight-decay-in-layer-normalization-challenges-and-a-reparameterization-solution-ad4d12c24950}
 }
 ```
+
+```bibtex
+@article{Zhu2025FracConnectionsFE,
+    title   = {Frac-Connections: Fractional Extension of Hyper-Connections},
+    author  = {Defa Zhu and Hongzhi Huang and Jundong Zhou and Zihao Huang and Yutao Zeng and Banggu Wu and Qiyang Min and Xun Zhou},
+    journal = {ArXiv},
+    year    = {2025},
+    volume  = {abs/2503.14125},
+    url     = {https://api.semanticscholar.org/CorpusID:277104144}
+}
+```
@@ -100,6 +100,12 @@ To compare hyper connections to plain residual without changing the code, just p
 get_init_and_expand_reduce_stream_functions(4, disable = True)
 ```
 
+To use the fractionated feature dimensions proposed in [a follow-up paper](https://arxiv.org/abs/2503.14125) by the same authors, just instantiate with `num_fracs` greater than `1`, like so
+
+```python
+get_init_and_expand_reduce_stream_functions(1, num_fracs = 4) # also allows you to mix streams and fractions of the feature dimension
+```
+
 ## Citation
 
 ```bibtex
@@ -119,3 +125,14 @@ get_init_and_expand_reduce_stream_functions(4, disable = True)
     url = {https://medium.com/@ohadrubin/exploring-weight-decay-in-layer-normalization-challenges-and-a-reparameterization-solution-ad4d12c24950}
 }
 ```
+
+```bibtex
+@article{Zhu2025FracConnectionsFE,
+    title   = {Frac-Connections: Fractional Extension of Hyper-Connections},
+    author  = {Defa Zhu and Hongzhi Huang and Jundong Zhou and Zihao Huang and Yutao Zeng and Banggu Wu and Qiyang Min and Xun Zhou},
+    journal = {ArXiv},
+    year    = {2025},
+    volume  = {abs/2503.14125},
+    url     = {https://api.semanticscholar.org/CorpusID:277104144}
+}
+```
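
Since the README snippet only shows the constructor call, here is a fuller end-to-end sketch of how `num_fracs` composes with the existing stream API. It mirrors the wrapping pattern in the package's test (further down in this diff); the `nn.Linear` branch and all dimensions are illustrative, not from the source.

```python
import torch
from torch import nn
from hyper_connections import get_init_and_expand_reduce_stream_functions

# 2 residual streams, each feature vector split into 4 fractions
init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(2, num_fracs = 4)

dim = 512                        # must be divisible by num_fracs
branch = nn.Linear(dim, dim)     # illustrative branch module

hyper_conn = init_hyper_conn(dim = dim, branch = branch)

x = torch.randn(1, 1024, dim)

x = expand_stream(x)             # one copy of the input per stream, folded into batch
x = hyper_conn(x)                # width connection -> branch -> depth connection
x = reduce_stream(x)             # sum the streams back down

assert x.shape == (1, 1024, dim)
```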
@@ -5,13 +5,13 @@ from functools import partial
 from random import randrange
 
 import torch
-from torch import nn
-from torch.nn import Module
+from torch import nn, cat
 import torch.nn.functional as F
+from torch.nn import Module, Sequential
 from torch.utils._pytree import tree_flatten, tree_unflatten
 
 from einops import rearrange, repeat, reduce, einsum
-from einops.layers.torch import Reduce
+from einops.layers.torch import Rearrange, Reduce
 
 """
 ein notation:
@@ -19,6 +19,7 @@ b - batch
 d - feature dimension
 s - residual streams
 t - residual streams + num branch inputs
+f - number of fractions (division of feature dimension space)
 v - number of views for branch input
 """
 
@@ -27,6 +28,9 @@ v - number of views for branch input
 def exists(v):
     return v is not None
 
+def divisible_by(num, den):
+    return (num % den) == 0
+
 def default(v, d):
     return v if exists(v) else d
 
@@ -38,8 +42,12 @@ def add(x, y):
 
 # main functions
 
-def get_expand_reduce_stream_functions(num_streams, add_stream_embed = False, dim = None, disable = False):
-
+def get_expand_reduce_stream_functions(
+    num_streams,
+    add_stream_embed = False,
+    dim = None,
+    disable = False
+):
     if num_streams == 1 or disable:
         return (nn.Identity(), nn.Identity())
 
@@ -54,11 +62,18 @@ def get_expand_reduce_stream_functions(num_streams, add_stream_embed = False, di
 
     return expand_fn, reduce_fn
 
-def get_init_and_expand_reduce_stream_functions(num_streams, dim = None, add_stream_embed = False, disable = False):
+def get_init_and_expand_reduce_stream_functions(
+    num_streams,
+    num_fracs = 1,
+    dim = None,
+    add_stream_embed = False,
+    disable = None
+):
+    disable = default(disable, num_streams == 1 and num_fracs == 1)
 
    hyper_conn_klass = HyperConnections if not disable else Residual
 
-    init_hyper_conn_fn = partial(hyper_conn_klass, num_streams)
+    init_hyper_conn_fn = partial(hyper_conn_klass, num_streams, num_fracs = num_fracs)
    expand_reduce_fns = get_expand_reduce_stream_functions(num_streams, add_stream_embed = add_stream_embed, dim = dim, disable = disable)
 
    if exists(dim):
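
Note the changed semantics in the hunk above: `disable` now defaults to `None` and is resolved, so the plain `Residual` wrapper is only substituted when there is genuinely nothing to mix, i.e. a single stream and a single fraction. A small sketch of the resulting behavior, using the return convention visible in the source above:

```python
from hyper_connections import get_init_and_expand_reduce_stream_functions

# resolved disable = (num_streams == 1 and num_fracs == 1)
init_a, *_ = get_init_and_expand_reduce_stream_functions(1)                 # plain Residual
init_b, *_ = get_init_and_expand_reduce_stream_functions(1, num_fracs = 4)  # HyperConnections (fractions only)
init_c, *_ = get_init_and_expand_reduce_stream_functions(4)                 # HyperConnections (streams only)
```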
@@ -93,13 +108,24 @@ class Residual(Module):
         self.branch = branch
         self.residual_transform = default(residual_transform, nn.Identity())
 
-    def width_connection(self, residuals):
+    def width_connection(
+        self,
+        residuals
+    ):
         return residuals, residuals, dict()
 
-    def depth_connection(self, branch_output, residuals):
+    def depth_connection(
+        self,
+        branch_output,
+        residuals,
+
+    ):
         return branch_output + self.residual_transform(residuals)
 
-    def decorate_branch(self, branch: Callable):
+    def decorate_branch(
+        self,
+        branch: Callable
+    ):
         assert not exists(self.branch), 'branch was already wrapped on init'
 
         def forward_and_add_residual(residual, *args, **kwargs):
@@ -113,7 +139,12 @@ class Residual(Module):
 
         return forward_and_add_residual
 
-    def forward(self, residuals, *branch_args, **branch_kwargs):
+    def forward(
+        self,
+        residuals,
+        *branch_args,
+        **branch_kwargs
+    ):
 
         branch_input, residuals, residual_kwargs = self.width_connection(residuals)
 
@@ -145,9 +176,10 @@ class HyperConnections(Module):
         channel_first = False,
         dropout = 0.,
         residual_transform: Module | None = None, # to support resnet blocks where dimension in is not equal to dimension out - usually a residual conv
-        add_branch_out_to_residual = True, # will disable depth connections (weighted residual sum with beta) if set False
-        num_input_views = 1, # allow for the branch module to receive multiple input views, dimension placed on the very left (before batch)
-        depth_residual_fn = add
+        add_branch_out_to_residual = True,   # will disable depth connections (weighted residual sum with beta) if set False
+        num_input_views = 1,                 # allow for the branch module to receive multiple input views, dimension placed on the very left (before batch)
+        depth_residual_fn = add,
+        num_fracs = 1                        # https://arxiv.org/abs/2503.14125
     ):
         """
         Appendix J, Algorithm 2 in - https://arxiv.org/abs/2409.19606
@@ -160,13 +192,34 @@ class HyperConnections(Module):
 
         self.act = nn.Tanh() if tanh else nn.Identity()
 
-        self.norm = RMSNorm(dim) # they used layernorm in paper, but rmsnorm is fine given what we know now
+        # frac-connections paper - num_fracs > 1 will be the `m` in their paper https://arxiv.org/abs/2503.14125
+
+        assert num_fracs >= 1
+
+        self.num_fracs = num_fracs
+        self.has_fracs = num_fracs > 1
+
+        self.split_fracs = Rearrange('b ... (f d) -> b ... f d', f = num_fracs)
+        self.merge_fracs = Rearrange('b ... f d -> b ... (f d)')
+
+        assert divisible_by(dim, num_fracs), f'feature dimension ({dim}) must be divisible by `num_fracs` ({num_fracs})'
+
+        dim //= num_fracs # the effective dimension handled within is the feature dimension divided by the number of fractions
+
+        # they used layernorm in the paper, but rmsnorm is fine given what we know now
+
+        self.norm = RMSNorm(dim)
 
         assert num_residual_streams > 0, '`num_residual_streams` must be greater than 0'
 
         self.num_residual_streams = num_residual_streams
         init_residual_index = default(layer_index, randrange(num_residual_streams)) % num_residual_streams # just choose one random residual stream if layer index not given
 
+        # handle the parameter dimensions, which may require (num_residuals x num_fractions) - generalizing hyper + frac connections
+
+        num_residual_streams_fracs = num_residual_streams * num_fracs
+        num_input_views_fracs = num_input_views * num_fracs
+
        # width num residual streams
 
        assert num_input_views >= 1
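
The two `Rearrange` layers registered in the hunk above do all of the fraction bookkeeping. A standalone shape check of what they compute, with illustrative dimensions:

```python
import torch
from einops.layers.torch import Rearrange

num_fracs = 4
split_fracs = Rearrange('b ... (f d) -> b ... f d', f = num_fracs)
merge_fracs = Rearrange('b ... f d -> b ... (f d)')

x = torch.randn(2, 1024, 512)                         # (batch, seq, features)
assert split_fracs(x).shape == (2, 1024, 4, 128)      # feature dim carved into 4 fractions
assert torch.equal(merge_fracs(split_fracs(x)), x)    # exact round trip
```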
@@ -174,12 +227,12 @@ class HyperConnections(Module):
 
         # width connection
 
-        init_alpha0 = torch.zeros((num_residual_streams, num_input_views))
+        init_alpha0 = torch.zeros((num_residual_streams_fracs, num_input_views_fracs))
         init_alpha0[init_residual_index, :] = 1.
 
-        self.static_alpha = nn.Parameter(torch.cat([init_alpha0, torch.eye(num_residual_streams)], dim = 1))
+        self.static_alpha = nn.Parameter(cat((init_alpha0, torch.eye(num_residual_streams_fracs)), dim = 1))
 
-        self.dynamic_alpha_fn = nn.Parameter(torch.zeros(dim, num_residual_streams + num_input_views))
+        self.dynamic_alpha_fn = nn.Parameter(torch.zeros(dim, num_residual_streams_fracs + num_input_views_fracs))
         self.dynamic_alpha_scale = nn.Parameter(torch.ones(()) * 1e-2)
 
         # depth connection related (beta)
@@ -187,8 +240,11 @@ class HyperConnections(Module):
         self.add_branch_out_to_residual = add_branch_out_to_residual
 
         if add_branch_out_to_residual:
-            self.static_beta = nn.Parameter(torch.ones(num_residual_streams))
-            self.dynamic_beta_fn = nn.Parameter(torch.zeros(dim))
+            self.static_beta = nn.Parameter(torch.ones(num_residual_streams_fracs))
+
+            dynamic_beta_shape = (dim,) if num_fracs == 1 else (dim, num_fracs) # preserve backwards compat
+            self.dynamic_beta_fn = nn.Parameter(torch.zeros(dynamic_beta_shape))
+
             self.dynamic_beta_scale = nn.Parameter(torch.ones(()) * 1e-2)
 
         # dropouts
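
The `dynamic_beta_shape` guard keeps the `num_fracs == 1` parameter identical to the 0.1.x releases, and the runtime rearrange further down (`'... -> ... 1'`) restores the missing fraction axis. A sketch of that bridge with illustrative tensors, assuming the normed residuals carry a size-1 fraction axis at that point:

```python
import torch
from einops import rearrange

b, n, s, d = 2, 16, 4, 64
normed = torch.randn(b, n, 1, s, d)                 # num_fracs == 1, so the frac axis is size 1

dynamic_beta_fn = torch.zeros(d)                    # legacy (dim,) parameter shape, unchanged
dc_weight = normed @ dynamic_beta_fn                # matrix-vector product drops d -> (b, n, 1, s)
dc_weight = rearrange(dc_weight, '... -> ... 1')    # add back the fraction axis expected downstream
assert dc_weight.shape == (b, n, 1, s, 1)
```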
@@ -209,16 +265,30 @@ class HyperConnections(Module):
 
         self.depth_residual_fn = depth_residual_fn
 
-    def width_connection(self, residuals):
+    def width_connection(
+        self,
+        residuals
+    ):
+        streams = self.num_residual_streams
 
         maybe_transformed_residuals = self.residual_transform(residuals)
 
         # width connection
 
+        # handle channel first
+
         if self.channel_first:
             residuals = rearrange(residuals, 'b d ... -> b ... d')
 
-        residuals = rearrange(residuals, '(b s) ... d -> b ... s d', s = self.num_residual_streams)
+        # split out fractions
+
+        residuals = self.split_fracs(residuals)
+
+        # split out streams
+
+        residuals = rearrange(residuals, '(b s) ... d -> b ... s d', s = streams)
+
+        # norm
 
         normed = self.norm(residuals)
 
@@ -226,7 +296,12 @@ class HyperConnections(Module):
 
         wc_weight = self.act(normed @ self.dynamic_alpha_fn)
         dynamic_alpha = wc_weight * self.dynamic_alpha_scale
-        alpha = dynamic_alpha + self.static_alpha
+
+        static_alpha = rearrange(self.static_alpha, '(f s) d -> f s d', s = streams)
+
+        alpha = dynamic_alpha + static_alpha
+
+        alpha = self.split_fracs(alpha) # (batch, seq, fracs1, streams, fracs2, input + residual streams)
 
         # beta for weights from branch output back to residual streams
 
@@ -234,10 +309,17 @@ class HyperConnections(Module):
 
         if self.add_branch_out_to_residual:
             dc_weight = self.act(normed @ self.dynamic_beta_fn)
+
+            if not self.has_fracs:
+                dc_weight = rearrange(dc_weight, '... -> ... 1')
+
             dynamic_beta = dc_weight * self.dynamic_beta_scale
-            beta = dynamic_beta + self.static_beta
 
-        mix_h = einsum(alpha, residuals, '... s t, ... s d -> ... t d')
+            static_beta = rearrange(self.static_beta, '... (s f) -> ... s f', s = streams)
+
+            beta = dynamic_beta + static_beta
+
+        mix_h = einsum(alpha, residuals, '... f1 s f2 t, ... f1 s d -> ... f2 t d')
 
         if self.num_input_views == 1:
             branch_input, residuals = mix_h[..., 0, :], mix_h[..., 1:, :]
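
The extended einsum above is the heart of the generalization: `alpha` now routes every (fraction, stream) pair of the residuals to every (fraction, branch-input-or-stream) pair. A standalone shape check, with all dimensions illustrative:

```python
import torch
from einops import einsum

b, n = 2, 16               # batch, sequence
s, f, v = 4, 2, 1          # streams, fractions, branch input views
t = v + s                  # branch inputs + residual streams

alpha     = torch.randn(b, n, f, s, f, t)   # (batch, seq, fracs1, streams, fracs2, t)
residuals = torch.randn(b, n, f, s, 64)     # 64 = per-fraction feature dimension

mix_h = einsum(alpha, residuals, '... f1 s f2 t, ... f1 s d -> ... f2 t d')
assert mix_h.shape == (b, n, f, t, 64)      # every fraction/stream feeds every fraction/output
```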
@@ -248,19 +330,40 @@ class HyperConnections(Module):
 
         if self.channel_first:
             branch_input = rearrange(branch_input, 'b ... d -> b d ...')
 
+        # maybe merge fractions back
+
+        branch_input = self.merge_fracs(branch_input)
+
         return branch_input, maybe_transformed_residuals, dict(beta = beta)
 
-    def depth_connection(self, branch_output, residuals, *, beta):
+    def depth_connection(
+        self,
+        branch_output,
+        residuals,
+        *,
+        beta
+    ):
         assert self.add_branch_out_to_residual
 
+        # maybe split fractions
+
+        branch_output = self.split_fracs(branch_output)
+
         # 'depth' connection
 
         if self.channel_first:
             branch_output = rearrange(branch_output, 'b d ... -> b ... d')
 
-        output = einsum(branch_output, beta, 'b ... d, b ... s -> b ... s d')
+        output = einsum(branch_output, beta, 'b ... f1 d, b ... f1 s f2 -> b ... f2 s d')
+
         output = rearrange(output, 'b ... s d -> (b s) ... d')
 
+        # merge fractions back
+
+        output = self.merge_fracs(output)
+
+        # channel first
+
         if self.channel_first:
             output = rearrange(output, 'b ... d -> b d ...')
 
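
The depth-connection einsum is generalized the same way, with `beta` carrying a `(streams, fracs)` weighting for each fraction of the branch output; `merge_fracs` then restores the full feature dimension. A matching shape sketch, dimensions illustrative:

```python
import torch
from einops import einsum, rearrange

b, n, s, f, d = 2, 16, 4, 2, 64

branch_output = torch.randn(b, n, f, d)     # branch output with fractions split back out
beta          = torch.randn(b, n, f, s, f)  # (batch, seq, fracs1, streams, fracs2)

output = einsum(branch_output, beta, 'b ... f1 d, b ... f1 s f2 -> b ... f2 s d')
output = rearrange(output, 'b ... s d -> (b s) ... d')   # fold streams back into the batch
assert output.shape == (b * s, n, f, d)                  # merge_fracs then gives (b*s, n, f*d)
```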
@@ -268,7 +371,10 @@ class HyperConnections(Module):
 
         return self.dropout(residuals)
 
-    def decorate_branch(self, branch: Callable):
+    def decorate_branch(
+        self,
+        branch: Callable
+    ):
         assert not exists(self.branch), 'branch was already wrapped on init'
 
         def forward_and_add_residual(residual, *args, **kwargs):
@@ -282,7 +388,12 @@ class HyperConnections(Module):
 
         return forward_and_add_residual
 
-    def forward(self, residuals, *branch_args, **branch_kwargs):
+    def forward(
+        self,
+        residuals,
+        *branch_args,
+        **branch_kwargs
+    ):
 
         branch_input, residuals, residual_kwargs = self.width_connection(residuals)
 
@@ -49,7 +49,9 @@ def get_expand_reduce_stream_functions(num_streams, disable = False):
 
     return expand_fn, reduce_fn
 
-def get_init_and_expand_reduce_stream_functions(num_streams, disable = False):
+def get_init_and_expand_reduce_stream_functions(num_streams, disable = None):
+
+    disable = default(disable, num_streams == 1)
 
     hyper_conn_klass = HyperConnections if not disable else Residual
 
@@ -50,7 +50,9 @@ def get_expand_reduce_stream_functions(cls, num_streams, disable = False):
 
     return expand_fn, reduce_fn
 
-def get_init_and_expand_reduce_stream_functions(cls, num_streams, disable = False):
+def get_init_and_expand_reduce_stream_functions(cls, num_streams, disable = None):
+
+    disable = default(disable, num_streams == 1)
 
     hyper_conn_klass = HyperConnections if not disable else Residual
 
@@ -41,7 +41,9 @@ def get_expand_reduce_stream_functions(num_streams, disable = False):
 
     return expand_fn, reduce_fn
 
-def get_init_and_expand_reduce_stream_functions(num_streams, disable = False):
+def get_init_and_expand_reduce_stream_functions(num_streams, disable = None):
+
+    disable = default(disable, num_streams == 1)
 
     hyper_conn_klass = HyperConnections if not disable else Residual
 
@@ -1,6 +1,6 @@
 [project]
 name = "hyper-connections"
-version = "0.1.15"
+version = "0.2.1"
 description = "Hyper-Connections"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -3,8 +3,12 @@ import pytest
 import torch
 from torch import nn
 
+@pytest.mark.parametrize('num_fracs', (1, 4))
 @pytest.mark.parametrize('disable', (False, True))
-def test_readme(disable):
+def test_readme(
+    num_fracs,
+    disable
+):
 
     # a single branch layer
 
@@ -20,7 +24,7 @@ def test_readme(disable):
 
     from hyper_connections import get_init_and_expand_reduce_stream_functions
 
-    init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4, disable = disable)
+    init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(4, num_fracs = num_fracs, disable = disable)
 
     # 1. wrap your branch function
 