difflayers 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- difflayers/__init__.py +965 -0
- difflayers/activation.py +339 -0
- difflayers/attention_operator.py +157 -0
- difflayers/auxiliary/__init__.py +0 -0
- difflayers/auxiliary/data.py +252 -0
- difflayers/diffused_attention.py +427 -0
- difflayers/diffusion.py +395 -0
- difflayers/dynamics_engine.py +540 -0
- difflayers/functional.py +459 -0
- difflayers/graph/__init__.py +18 -0
- difflayers/graph/build_graph.py +77 -0
- difflayers/graph/builder.py +120 -0
- difflayers/graph/laplacian.py +76 -0
- difflayers/graph/laplacian_builder.py +64 -0
- difflayers/transformer.py +212 -0
- difflayers-0.1.0.dist-info/METADATA +210 -0
- difflayers-0.1.0.dist-info/RECORD +20 -0
- difflayers-0.1.0.dist-info/WHEEL +5 -0
- difflayers-0.1.0.dist-info/licenses/LICENSE +79 -0
- difflayers-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Laplacian computation for graph-regularized Hopfield attention.
|
|
3
|
+
|
|
4
|
+
Responsibility: Convert an adjacency matrix into a graph Laplacian.
|
|
5
|
+
|
|
6
|
+
Provides both the unnormalized (L = D - A) and symmetric normalized
|
|
7
|
+
(L_norm = D^{-1/2} L D^{-1/2}) graph Laplacians.
|
|
8
|
+
|
|
9
|
+
Supports dense and sparse adjacency inputs. A sparse_coo input is
|
|
10
|
+
densified first, so both functions return dense Laplacians and need
|
|
11
|
+
O(N²) memory regardless of the layout of the input.
|
|
12
|
+
|
|
13
|
+
Complexity (dense N×N input):
|
|
14
|
+
compute_laplacian : O(N²) time, O(N²) space
|
|
15
|
+
compute_normalized_laplacian : O(N²) time, O(N²) space
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import torch
|
|
19
|
+
from torch import Tensor
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _is_sparse(A: Tensor) -> bool:
|
|
23
|
+
return A.is_sparse
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def compute_laplacian(A: Tensor) -> Tensor:
    """
    Compute the unnormalized graph Laplacian L = D - A.

    D is the diagonal matrix holding the row sums (degrees) of A.
    A sparse_coo input is densified first, so the result is always a
    dense tensor and needs O(N²) memory regardless of how sparse A is.

    Args:
        A: (N, N) adjacency / similarity matrix (dense or sparse_coo).
           Leading batch dimensions, i.e. (..., N, N), are accepted too.

    Returns:
        L: dense graph Laplacian with the same shape as A.
           For a symmetric, non-negative A the eigenvalues lie in
           [0, 2 * max_degree].
    """
    A_dense = A.to_dense() if A.is_sparse else A
    # diag_embed builds one diagonal matrix per leading batch entry; for a
    # plain (N, N) input it matches torch.diag applied to the degree vector.
    D = torch.diag_embed(A_dense.sum(dim=-1))
    return D - A_dense
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def compute_normalized_laplacian(A: Tensor) -> Tensor:
    """
    Compute the symmetric-normalized graph Laplacian:
        L_norm = D^{-1/2} (D - A) D^{-1/2}

    For a symmetric, non-negative A the eigenvalues are in [0, 2].
    Nodes without positive degree (isolated nodes) get an inverse-sqrt
    degree of 0, so their rows and columns of L_norm are all zero and
    no inf / NaN is produced, neither in the values nor in the gradients.

    Works with both dense and sparse_coo adjacency matrices; a sparse
    input is densified and the result is always dense.

    Args:
        A: (N, N) adjacency / similarity matrix (dense or sparse_coo).
           Leading batch dimensions, i.e. (..., N, N), are accepted too.

    Returns:
        L_norm: dense normalized graph Laplacian with the same shape as A.
    """
    A_dense = A.to_dense() if A.is_sparse else A
    deg = A_dense.sum(dim=-1)                                   # (..., N)

    # Take the power on a "safe" degree (1 where the real degree is not
    # positive) and mask afterwards: this keeps inf out of the autograd
    # graph, which would otherwise turn into NaN gradients.
    has_edges = deg > 0
    safe_deg = torch.where(has_edges, deg, torch.ones_like(deg))
    inv_sqrt = safe_deg.pow(-0.5)
    d_inv_sqrt = torch.where(has_edges, inv_sqrt, torch.zeros_like(inv_sqrt))

    L = torch.diag_embed(deg) - A_dense
    # Scaling rows and columns by broadcasting is the same as multiplying
    # with diag(d_inv_sqrt) from both sides, but costs O(N²) instead of
    # two dense O(N³) matrix products.
    return d_inv_sqrt.unsqueeze(-1) * L * d_inv_sqrt.unsqueeze(-2)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Laplacian builder for Graph-Regularized Hopfield attention.
|
|
3
|
+
|
|
4
|
+
Responsibility: Compute the graph Laplacian L from adjacency W.
|
|
5
|
+
This is the *only* class that calls ``compute_laplacian`` or
|
|
6
|
+
``compute_normalized_laplacian``; it is used exclusively when an explicit
|
|
7
|
+
Laplacian is required — i.e. for ``SpectralDiffusion`` (eigendecomposition)
|
|
8
|
+
and energy tracking (``EnergyTracker``).
|
|
9
|
+
|
|
10
|
+
For ``FactoredDiffusion`` the Laplacian is never formed; that mode uses
|
|
11
|
+
(W, deg) directly, so ``LaplacianBuilder`` is not needed there.
|
|
12
|
+
|
|
13
|
+
Usage::
|
|
14
|
+
|
|
15
|
+
builder = LaplacianBuilder(normalized=True)
|
|
16
|
+
L = builder.build(W) # W: (N, N) dense or sparse
|
|
17
|
+
|
|
18
|
+
Complexity:
|
|
19
|
+
normalized=False : O(N²) time, O(N²) space
|
|
20
|
+
normalized=True : O(N²) time, O(N²) space (D^{-1/2} L D^{-1/2})
|
|
21
|
+
|
|
22
|
+
Memory: O(N²) — always returns a dense Laplacian
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from torch import Tensor
|
|
28
|
+
|
|
29
|
+
from .laplacian import compute_laplacian, compute_normalized_laplacian
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LaplacianBuilder:
    """
    Turns an adjacency matrix W into a graph Laplacian.

    The flavour is fixed at construction time:

    * ``normalized=True``  — delegates to ``compute_normalized_laplacian``
      (L_norm = D^{-1/2}(D-A)D^{-1/2}).
    * ``normalized=False`` — delegates to ``compute_laplacian``
      (L = D - A).

    Args:
        normalized: Selects the symmetric-normalized variant when True
            (the default), the unnormalized one otherwise.
    """

    def __init__(self, normalized: bool = True) -> None:
        self.normalized = normalized

    def build(self, W: Tensor) -> Tensor:
        """
        Build the Laplacian of the graph described by W.

        Args:
            W: (N, N) adjacency matrix — dense or sparse_coo.

        Returns:
            L: whatever the selected Laplacian function returns for W.
        """
        # Pick the implementation once, then apply it.
        laplacian_fn = compute_normalized_laplacian if self.normalized else compute_laplacian
        return laplacian_fn(W)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from torch import Tensor
|
|
6
|
+
from torch.nn.modules import Module
|
|
7
|
+
from typing import Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
from . import Hopfield
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HopfieldEncoderLayer(Module):
    """
    Module with underlying Hopfield association to be used as an encoder in transformer-like architectures.

    The layer applies the Hopfield association followed by a two-layer feed-forward block; both sub-blocks
    use dropout, a residual connection and a trailing layer normalisation.
    """

    def __init__(self,
                 hopfield_association: Hopfield,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 activation: str = r'relu'
                 ):
        """
        Initialise a new instance of a Hopfield association-based encoder module.

        :param hopfield_association: instance of Hopfield association module (a deep copy is stored)
        :param dim_feedforward: depth of the linear projections applied internally
        :param dropout: dropout probability to be applied internally
        :param activation: name of a function in the <torch> namespace, applied on the result of the
            first internal linear projection
        :raises AssertionError: if <activation> does not name a callable attribute of <torch>
        """
        super(HopfieldEncoderLayer, self).__init__()
        # The passed module is copied, so the layer never shares parameters with the caller's instance.
        self.hopfield_association = deepcopy(hopfield_association)

        self.linear_residual = nn.Linear(self.hopfield_association.state_pattern_dim, dim_feedforward)
        self.dropout_residual = nn.Dropout(dropout)
        self.linear_output = nn.Linear(dim_feedforward, self.hopfield_association.state_pattern_dim)

        self.norm_residual = nn.LayerNorm(self.hopfield_association.state_pattern_dim)
        self.norm_output = nn.LayerNorm(self.hopfield_association.state_pattern_dim)
        self.dropout_hopfield_association = nn.Dropout(dropout)
        self.dropout_output = nn.Dropout(dropout)

        self.activation_residual = getattr(torch, activation, None)
        if not callable(self.activation_residual):
            # Raised explicitly instead of using <assert>, so the check is not stripped by <python -O>.
            # The exception type is kept for backward compatibility.
            raise AssertionError(r'invalid activation function supplied.')
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """
        Reset parameters, including Hopfield association.

        Sub-modules without a <reset_parameters> method are skipped.

        :return: None
        """
        for module in (self.hopfield_association, self.linear_residual,
                       self.linear_output, self.norm_residual, self.norm_output):
            if hasattr(module, r'reset_parameters'):
                module.reset_parameters()

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Apply Hopfield encoding on specified data.

        :param src: data to be processed by Hopfield encoder module
        :param src_mask: mask to be applied on association matrix
        :param src_key_padding_mask: mask to be applied on stored patterns
        :return: Hopfield-encoded input data
        """
        # Association sub-block: association -> dropout -> residual -> layer norm.
        data_associated = self.hopfield_association(
            input=src, stored_pattern_padding_mask=src_key_padding_mask, association_mask=src_mask)
        src = src + self.dropout_hopfield_association(input=data_associated)
        src = self.norm_residual(input=src)

        # Feed-forward sub-block: linear -> activation -> dropout -> linear -> dropout -> residual -> layer norm.
        result_residual_inner = self.activation_residual(input=self.linear_residual(input=src))
        data_associated = self.linear_output(input=self.dropout_residual(input=result_residual_inner))
        src = src + self.dropout_output(input=data_associated)

        return self.norm_output(input=src)

    def get_association_matrix(self, input: Union[Tensor, Tuple[Tensor, Tensor, Tensor]]) -> Tensor:
        """
        Fetch Hopfield association matrix gathered by passing through the specified data.

        :param input: data to be passed through the Hopfield association
        :return: association matrix as computed by the Hopfield core module
        """
        return self.hopfield_association.get_association_matrix(input=input)

    @property
    def batch_first(self) -> int:
        # Forwarded unchanged from the wrapped Hopfield association.
        return self.hopfield_association.batch_first

    @property
    def input_size(self) -> int:
        # Forwarded unchanged from the wrapped Hopfield association.
        return self.hopfield_association.input_size

    @property
    def output_size(self) -> int:
        # The last linear projection defines the depth of the layer output.
        return self.linear_output.out_features
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class HopfieldDecoderLayer(Module):
    """
    Module with underlying Hopfield associations to be used as a decoder in transformer-like architectures.

    The layer applies a self-association on the target, a cross-association between target and memory, and a
    two-layer feed-forward block; each sub-block uses dropout, a residual connection and a trailing layer
    normalisation.
    """

    def __init__(self,
                 hopfield_association_self: Hopfield,
                 hopfield_association_cross: Hopfield,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 activation: str = r'relu'
                 ):
        """
        Initialise a new instance of a Hopfield association-based decoder module.

        :param hopfield_association_self: instance of Hopfield self-association module (a deep copy is stored)
        :param hopfield_association_cross: instance of Hopfield cross-association module (a deep copy is stored)
        :param dim_feedforward: depth of the linear projections applied internally
        :param dropout: dropout probability to be applied internally
        :param activation: name of a function in the <torch> namespace, applied on the result of the
            first internal linear projection
        :raises AssertionError: if <activation> does not name a callable attribute of <torch>
        """
        super(HopfieldDecoderLayer, self).__init__()
        # Both passed modules are copied, so the layer never shares parameters with the caller's instances.
        self.hopfield_association_self = deepcopy(hopfield_association_self)
        self.hopfield_association_cross = deepcopy(hopfield_association_cross)

        # All internal depths are derived from the self-association module.
        self.linear_residual = nn.Linear(self.hopfield_association_self.state_pattern_dim, dim_feedforward)
        self.dropout_residual = nn.Dropout(dropout)
        self.linear_output = nn.Linear(dim_feedforward, self.hopfield_association_self.state_pattern_dim)

        self.norm_residual_self = nn.LayerNorm(self.hopfield_association_self.state_pattern_dim)
        self.norm_residual_cross = nn.LayerNorm(self.hopfield_association_self.state_pattern_dim)
        self.norm_output = nn.LayerNorm(self.hopfield_association_self.state_pattern_dim)
        self.dropout_hopfield_association_self = nn.Dropout(dropout)
        self.dropout_hopfield_association_cross = nn.Dropout(dropout)
        self.dropout_output = nn.Dropout(dropout)

        self.activation_residual = getattr(torch, activation, None)
        if not callable(self.activation_residual):
            # Raised explicitly instead of using <assert>, so the check is not stripped by <python -O>.
            # The exception type is kept for backward compatibility.
            raise AssertionError(r'invalid activation function supplied.')
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """
        Reset parameters, including Hopfield associations.

        Sub-modules without a <reset_parameters> method are skipped.

        :return: None
        """
        for module in (self.hopfield_association_self, self.hopfield_association_cross,
                       self.linear_residual, self.linear_output, self.norm_residual_self,
                       self.norm_residual_cross, self.norm_output):
            if hasattr(module, r'reset_parameters'):
                module.reset_parameters()

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Apply Hopfield decoding on specified data.

        :param tgt: data to be processed by Hopfield decoder module (self-association)
        :param memory: data to be processed by Hopfield encoder module (cross-association)
        :param tgt_mask: mask to be applied on self-association matrix
        :param memory_mask: mask to be applied on cross-association matrix
        :param tgt_key_padding_mask: padding mask handed to the self-association as stored pattern padding mask
        :param memory_key_padding_mask: padding mask handed to the cross-association as stored pattern padding mask
        :return: Hopfield-decoded input
        """
        # Self-association sub-block: association -> dropout -> residual -> layer norm.
        data_associated = self.hopfield_association_self(
            input=tgt, stored_pattern_padding_mask=tgt_key_padding_mask,
            association_mask=tgt_mask)
        tgt = tgt + self.dropout_hopfield_association_self(input=data_associated)
        tgt = self.norm_residual_self(input=tgt)

        # Cross-association sub-block: <memory> is passed as first and third element of the input tuple,
        # the (already self-associated) target as the second one.
        data_associated = self.hopfield_association_cross(
            input=(memory, tgt, memory), stored_pattern_padding_mask=memory_key_padding_mask,
            association_mask=memory_mask)
        tgt = tgt + self.dropout_hopfield_association_cross(input=data_associated)
        tgt = self.norm_residual_cross(input=tgt)

        # Feed-forward sub-block: linear -> activation -> dropout -> linear -> dropout -> residual -> layer norm.
        result_residual_inner = self.activation_residual(input=self.linear_residual(input=tgt))
        data_associated = self.linear_output(input=self.dropout_residual(input=result_residual_inner))
        tgt = tgt + self.dropout_output(input=data_associated)
        return self.norm_output(input=tgt)

    def get_association_matrix_self(self, input: Union[Tensor, Tuple[Tensor, Tensor, Tensor]]) -> Tensor:
        """
        Fetch Hopfield self-association matrix gathered by passing through the specified data.

        :param input: data to be passed through the Hopfield association
        :return: association matrix as computed by the Hopfield core module
        """
        return self.hopfield_association_self.get_association_matrix(input=input)

    def get_association_matrix_cross(self, input: Union[Tensor, Tuple[Tensor, Tensor, Tensor]]) -> Tensor:
        """
        Fetch Hopfield cross-association matrix gathered by passing through the specified data.

        :param input: data to be passed through the Hopfield association
        :return: association matrix as computed by the Hopfield core module
        """
        return self.hopfield_association_cross.get_association_matrix(input=input)

    @property
    def batch_first(self) -> int:
        # Forwarded unchanged from the wrapped self-association.
        return self.hopfield_association_self.batch_first

    @property
    def input_size(self) -> int:
        # Forwarded unchanged from the wrapped self-association.
        return self.hopfield_association_self.input_size

    @property
    def output_size(self) -> int:
        # The module only defines <linear_output>; there is no <linear_output_self> attribute.
        return self.linear_output.out_features
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: difflayers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: difflayers: Diffusion-Augmented Hopfield Networks
|
|
5
|
+
Home-page: https://github.com/hopfileds/hopfield-layers
|
|
6
|
+
Author: Priyam Ghosh
|
|
7
|
+
Author-email: Priyam Ghosh <priyamghosh9753@gmail.com>
|
|
8
|
+
License: BSD
|
|
9
|
+
Project-URL: Homepage, https://github.com/hopfileds/hopfield-layers
|
|
10
|
+
Project-URL: Repository, https://github.com/hopfileds/hopfield-layers
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/hopfileds/hopfield-layers/issues
|
|
12
|
+
Keywords: hopfield networks,deep learning,attention,diffusion,graph
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Operating System :: OS Independent
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: torch>=1.9.0
|
|
28
|
+
Requires-Dist: numpy>=1.20.0
|
|
29
|
+
Requires-Dist: scipy>=1.7.0
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
Dynamic: requires-python
|
|
34
|
+
|
|
35
|
+
# Hopfield Networks is All You Need
|
|
36
|
+
|
|
37
|
+
_Hubert Ramsauer<sup>1</sup>, Bernhard Schäfl<sup>1</sup>, Johannes Lehner<sup>1</sup>, Philipp Seidl<sup>1</sup>,
|
|
38
|
+
Michael Widrich<sup>1</sup>, Lukas Gruber<sup>1</sup>, Markus Holzleitner<sup>1</sup>, Milena Pavlović<sup>3, 4</sup>,
|
|
39
|
+
Geir Kjetil Sandve<sup>4</sup>, Victor Greiff<sup>3</sup>, David Kreil<sup>2</sup>, Michael Kopp<sup>2</sup>, Günter
|
|
40
|
+
Klambauer<sup>1</sup>, Johannes Brandstetter<sup>1</sup>, Sepp Hochreiter<sup>1, 2</sup>_
|
|
41
|
+
|
|
42
|
+
<sup>1</sup> ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria
|
|
43
|
+
<sup>2</sup> Institute of Advanced Research in Artificial Intelligence (IARAI)
|
|
44
|
+
<sup>3</sup> Department of Immunology, University of Oslo, Norway
|
|
45
|
+
<sup>4</sup> Department of Informatics, University of Oslo, Norway
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
##### Detailed blog post on this paper as well as the necessary background on Hopfield networks at [this link](https://ml-jku.github.io/hopfield-layers/).
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
The transformer and BERT models pushed the performance on NLP tasks to new levels via their attention mechanism. We show
|
|
54
|
+
that this attention mechanism is the update rule of a modern Hopfield network with continuous states. This new Hopfield
|
|
55
|
+
network can store exponentially (with the dimension) many patterns, converges with one update, and has exponentially
|
|
56
|
+
small retrieval errors. The number of stored patterns must be traded off against convergence speed and retrieval error.
|
|
57
|
+
The new Hopfield network has three types of energy minima (fixed points of the update):
|
|
58
|
+
|
|
59
|
+
1. global fixed point averaging over all patterns,
|
|
60
|
+
2. metastable states averaging over a subset of patterns, and
|
|
61
|
+
3. fixed points which store a single pattern.
|
|
62
|
+
|
|
63
|
+
Transformers learn an attention mechanism by constructing an embedding of patterns and queries into an associative
|
|
64
|
+
space. Transformer and BERT models operate in their first layers preferably in the global averaging regime, while they
|
|
65
|
+
operate in higher layers in metastable states. The gradient in transformers is maximal in the regime of metastable
|
|
66
|
+
states, is uniformly distributed when averaging globally, and vanishes when a fixed point is near a stored pattern.
|
|
67
|
+
Based on the Hopfield network interpretation, we analyzed learning of transformer and BERT architectures. Learning
|
|
68
|
+
starts with attention heads that average and then most of them switch to metastable states. However, the majority of
|
|
69
|
+
heads in the first layers still averages and can be replaced by averaging operations like the Gaussian weighting that we
|
|
70
|
+
propose. In contrast, heads in the last layers steadily learn and seem to use metastable states to collect information
|
|
71
|
+
created in lower layers. These heads seem a promising target for improving transformers. Neural networks that integrate
|
|
72
|
+
Hopfield networks that are equivalent to attention heads outperform other methods on immune repertoire classification,
|
|
73
|
+
where the Hopfield net stores several hundreds of thousands of patterns.
|
|
74
|
+
|
|
75
|
+
With _this_ repository, we provide a PyTorch implementation of a new layer called “Hopfield” which allows to equip deep
|
|
76
|
+
learning architectures with Hopfield networks as new memory concepts.
|
|
77
|
+
|
|
78
|
+
The full paper is available at [https://arxiv.org/abs/2008.02217](https://arxiv.org/abs/2008.02217).
|
|
79
|
+
|
|
80
|
+
## Requirements
|
|
81
|
+
|
|
82
|
+
The software was developed and tested on the following 64-bit operating systems:
|
|
83
|
+
|
|
84
|
+
- CentOS Linux release 8.1.1911 (Core)
|
|
85
|
+
- macOS 10.15.5 (Catalina)
|
|
86
|
+
|
|
87
|
+
As the development environment, [Python](https://www.python.org) 3.8.3 in combination
|
|
88
|
+
with [PyTorch](https://pytorch.org) 1.6.0 was used (a version of at least 1.5.0 should be sufficient). More details on
|
|
89
|
+
how to install PyTorch are available on the [official project page](https://pytorch.org).
|
|
90
|
+
|
|
91
|
+
## Installation
|
|
92
|
+
|
|
93
|
+
The recommended way to install the software is to use `pip/pip3`:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
$ pip3 install git+https://github.com/ml-jku/hopfield-layers
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
To successfully run the [Jupyter notebooks](https://jupyter.org) contained in [examples](examples/), additional
|
|
100
|
+
third-party modules are needed:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
$ pip3 install -r examples/requirements.txt
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The installation of the [Jupyter software](https://jupyter.org/install.html) itself is not covered. More details on how
|
|
107
|
+
to install Jupyter are available at the [official installation page](https://jupyter.org/install.html).
|
|
108
|
+
|
|
109
|
+
## Usage
|
|
110
|
+
|
|
111
|
+
To get up and running with Hopfield-based networks, only <i>one</i> argument needs to be set, the size (depth) of the
|
|
112
|
+
input.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from hflayers import Hopfield
|
|
116
|
+
|
|
117
|
+
hopfield = Hopfield(input_size=...)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
It is also possible to replace commonly used pooling functions with a Hopfield-based one. Internally, a <i>state
|
|
121
|
+
pattern</i> is trained, which in turn is used to compute pooling weights with respect to the input.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from hflayers import HopfieldPooling
|
|
125
|
+
|
|
126
|
+
hopfield_pooling = HopfieldPooling(input_size=...)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
A second variant of our Hopfield-based modules is one which employs a trainable but fixed lookup mechanism. Internally,
|
|
130
|
+
one or multiple <i>stored patterns</i> and <i>pattern projections</i> are trained (optionally in a non-shared manner),
|
|
131
|
+
which in turn are used as a lookup mechanism independent of the input data.
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from hflayers import HopfieldLayer
|
|
135
|
+
|
|
136
|
+
hopfield_lookup = HopfieldLayer(input_size=...)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
The usage is as <i>simple</i> as with the main module, but equally <i>powerful</i>.
|
|
140
|
+
|
|
141
|
+
## Examples
|
|
142
|
+
|
|
143
|
+
Generally, the Hopfield layer is designed to be used to implement or to substitute different layers like:
|
|
144
|
+
|
|
145
|
+
- <b>Pooling layers:</b> We consider the Hopfield layer as a pooling layer if only one static state (query) pattern
|
|
146
|
+
exists. Then, it is de facto a pooling over the sequence, which results from the softmax values applied on the stored
|
|
147
|
+
patterns. Therefore, our Hopfield layer can act as a pooling layer.
|
|
148
|
+
|
|
149
|
+
- <b>Permutation equivariant layers:</b> Our Hopfield layer can be used as a plug-in replacement for permutation
|
|
150
|
+
equivariant layers. Since the Hopfield layer is an associative memory it assumes no dependency between the input
|
|
151
|
+
patterns.
|
|
152
|
+
|
|
153
|
+
- <b>GRU & LSTM layers:</b> Our Hopfield layer can be used as a plug-in replacement for GRU & LSTM layers. Optionally,
|
|
154
|
+
for substituting GRU & LSTM layers, positional encoding might be considered.
|
|
155
|
+
|
|
156
|
+
- <b>Attention layers:</b> Our Hopfield layer can act as an attention layer, where state (query) and stored (key)
|
|
157
|
+
patterns are different, and need to be associated.
|
|
158
|
+
|
|
159
|
+
The folder [examples](examples/) contains multiple demonstrations on how to use the <code>Hopfield</code>, <code>
|
|
160
|
+
HopfieldPooling</code> as well as the <code>HopfieldLayer</code> modules. To successfully run the
|
|
161
|
+
contained [Jupyter notebooks](https://jupyter.org), additional third-party modules
|
|
162
|
+
like [pandas](https://pandas.pydata.org) and [seaborn](https://seaborn.pydata.org) are required.
|
|
163
|
+
|
|
164
|
+
- [Bit Pattern Set](examples/bit_pattern/bit_pattern_demo.ipynb): The dataset of this demonstration falls into the
|
|
165
|
+
category of <i>binary classification</i> tasks in the domain of <i>Multiple Instance Learning (MIL)</i> problems. Each
|
|
166
|
+
bag comprises a collection of bit pattern instances, whereas each instance is a sequence of <b>0s</b> and <b>1s</b>.
|
|
167
|
+
The positive class has specific bit patterns injected, which are absent in the negative one. This demonstration shows,
|
|
168
|
+
that <code>Hopfield</code>, <code>HopfieldPooling</code> and <code>HopfieldLayer</code> are capable of learning and
|
|
169
|
+
filtering each bag with respect to the class-defining bit patterns.
|
|
170
|
+
|
|
171
|
+
- [Latch Sequence Set](examples/latch_sequence/latch_sequence_demo.ipynb): We study an easy example of learning
|
|
172
|
+
long-term dependencies by using a simple <i>latch task</i>,
|
|
173
|
+
see [Hochreiter and Mozer](https://link.springer.com/chapter/10.1007/3-540-44668-0_92). The essence of this task is
|
|
174
|
+
that a sequence of inputs is presented, beginning with one of two symbols, <b>A</b> or <b>B</b>, and after a variable
|
|
175
|
+
number of time steps, the model has to output a corresponding symbol. Thus, the task requires memorizing the original
|
|
176
|
+
input over time. It has to be noted, that both class-defining symbols must only appear at the first position of a
|
|
177
|
+
sequence. This task was specifically designed to demonstrate the capability of recurrent neural networks to capture
|
|
178
|
+
long term dependencies. This demonstration shows, that <code>Hopfield</code>, <code>HopfieldPooling</code> and <code>
|
|
179
|
+
HopfieldLayer</code> adapt extremely fast to this specific task, concentrating only on the first entry of the
|
|
180
|
+
sequence.
|
|
181
|
+
|
|
182
|
+
- [Attention-based Deep Multiple Instance Learning](examples/mnist_bags/mnist_bags_demo.ipynb): The dataset of this
|
|
183
|
+
demonstration falls into the category of <i>binary classification</i> tasks in the domain of <i>Multiple Instance
|
|
184
|
+
Learning (MIL)</i> problems, see [Ilse and Tomczak](https://arxiv.org/abs/1802.04712). Each bag comprises a collection
|
|
185
|
+
of <b>28x28</b> grayscale images/instances, whereas each instance is a sequence of pixel values in the range
|
|
186
|
+
of <b>[0; 255]</b>. The number of instances per bag is drawn from a Gaussian with specified mean and variance. The
|
|
187
|
+
positive class is defined by the presence of the target number/digit, whereas the negative one by its absence.
|
|
188
|
+
|
|
189
|
+
## Disclaimer
|
|
190
|
+
|
|
191
|
+
Some implementations of this repository are based on existing ones of the
|
|
192
|
+
official [PyTorch repository v1.6.0](https://github.com/pytorch/pytorch/tree/v1.6.0) and accordingly extended and
|
|
193
|
+
modified. In the following, the involved parts are listed:
|
|
194
|
+
|
|
195
|
+
- The implementation of [HopfieldCore](hflayers/activation.py#L16) is based on the implementation
|
|
196
|
+
of [MultiheadAttention](https://github.com/pytorch/pytorch/blob/b31f58de6fa8bbda5353b3c77d9be4914399724d/torch/nn/modules/activation.py#L771)
|
|
197
|
+
.
|
|
198
|
+
- The implementation of [hopfield_core_forward](hflayers/functional.py#L8) is based on the implementation
|
|
199
|
+
of [multi_head_attention_forward](https://github.com/pytorch/pytorch/blob/b31f58de6fa8bbda5353b3c77d9be4914399724d/torch/nn/functional.py#L3854)
|
|
200
|
+
.
|
|
201
|
+
- The implementation of [HopfieldEncoderLayer](hflayers/transformer.py#L12) is based on the implementation
|
|
202
|
+
of [TransformerEncoderLayer](https://github.com/pytorch/pytorch/blob/b31f58de6fa8bbda5353b3c77d9be4914399724d/torch/nn/modules/transformer.py#L241)
|
|
203
|
+
.
|
|
204
|
+
- The implementation of [HopfieldDecoderLayer](hflayers/transformer.py#L101) is based on the implementation
|
|
205
|
+
of [TransformerDecoderLayer](https://github.com/pytorch/pytorch/blob/b31f58de6fa8bbda5353b3c77d9be4914399724d/torch/nn/modules/transformer.py#L303)
|
|
206
|
+
.
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
This repository is BSD-style licensed (see [LICENSE](LICENSE)), except where noted otherwise.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
difflayers/__init__.py,sha256=vpmvaQa-IOOUieCJHc8GYtDHYIhyQKhhmG0JzCynACs,47270
|
|
2
|
+
difflayers/activation.py,sha256=gl23vFjb_7cjTolftGZfSwwBOlwRZHOrpz89a7kLBHk,19207
|
|
3
|
+
difflayers/attention_operator.py,sha256=sqstQXzVZZkjloB_0NWGxRO_H0fItIlWwtaZCW3Q1L4,5743
|
|
4
|
+
difflayers/diffused_attention.py,sha256=-JvUHzYTMrdke8Q4TWRFR8vgvEaeuOa1LfOiF2Pv3GQ,19264
|
|
5
|
+
difflayers/diffusion.py,sha256=eWsNJOrdphjGYQbJ6SD8B9px3ns6c1xN8Dq58V9LqlU,13656
|
|
6
|
+
difflayers/dynamics_engine.py,sha256=hgiej34ETREmqDhf5xfW1pVD5bhcgFadZYGVkyW5peA,20287
|
|
7
|
+
difflayers/functional.py,sha256=aiwPffKCuTmBO7HE9Ezom4xxym4OILz-MQvDvH10otA,26693
|
|
8
|
+
difflayers/transformer.py,sha256=IPDQK6Qvu7EpLAjigk5e2TU5NT74Lud77OlmarKwtfA,9727
|
|
9
|
+
difflayers/auxiliary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
difflayers/auxiliary/data.py,sha256=YcsJIjIJz3CJuXba4nQIpA5rLVGZkcW5kSzyhpSJTh4,10392
|
|
11
|
+
difflayers/graph/__init__.py,sha256=EFGF-v4_NxJ1CrU3P1G-iZle9Xz9iY-UakdugPEAYp0,642
|
|
12
|
+
difflayers/graph/build_graph.py,sha256=1Tubtd3SMPLMHSuW8nnf4JQUjtND-mPyjCkkQpTrBj8,2555
|
|
13
|
+
difflayers/graph/builder.py,sha256=WeX2xwUALIRST3mliG2pnSkl44Fri5a6QANMLQZUzfU,4646
|
|
14
|
+
difflayers/graph/laplacian.py,sha256=k5cDHur9J881PAoBa3Hmtj4WjmSCIUWJ9ALUlxMIAu4,2528
|
|
15
|
+
difflayers/graph/laplacian_builder.py,sha256=7DzOfiqFWZu5vMwHfREktY5_G3743NyvjwnpwfjLQhs,2013
|
|
16
|
+
difflayers-0.1.0.dist-info/licenses/LICENSE,sha256=AjYSib8nN4uc727CcxTOJNX6ehJIHD8p8Ph0nX01Ifs,3387
|
|
17
|
+
difflayers-0.1.0.dist-info/METADATA,sha256=a5WWZHoSdjlDEjsLxcq8loDcH1wIWx2xM-Pi43cCKpk,11895
|
|
18
|
+
difflayers-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
19
|
+
difflayers-0.1.0.dist-info/top_level.txt,sha256=HRiuZ0X4NWI1tQEmeMlDqbRK17xpGTyNPnYqDezMias,11
|
|
20
|
+
difflayers-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
From Hopfield layers:
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2020, Institute for Machine Learning, Johannes Kepler University Linz (Bernhard Schäfl)
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
All other contributions:
|
|
7
|
+
Copyright (c) 2020 the respective contributors
|
|
8
|
+
All rights reserved.
|
|
9
|
+
|
|
10
|
+
From PyTorch:
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
|
13
|
+
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
|
14
|
+
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
|
15
|
+
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
|
16
|
+
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
|
17
|
+
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
|
18
|
+
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
|
19
|
+
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
|
20
|
+
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
|
21
|
+
|
|
22
|
+
From Caffe2:
|
|
23
|
+
|
|
24
|
+
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
|
25
|
+
|
|
26
|
+
All contributions by Facebook:
|
|
27
|
+
Copyright (c) 2016 Facebook Inc.
|
|
28
|
+
|
|
29
|
+
All contributions by Google:
|
|
30
|
+
Copyright (c) 2015 Google Inc.
|
|
31
|
+
All rights reserved.
|
|
32
|
+
|
|
33
|
+
All contributions by Yangqing Jia:
|
|
34
|
+
Copyright (c) 2015 Yangqing Jia
|
|
35
|
+
All rights reserved.
|
|
36
|
+
|
|
37
|
+
All contributions from Caffe:
|
|
38
|
+
Copyright(c) 2013, 2014, 2015, the respective contributors
|
|
39
|
+
All rights reserved.
|
|
40
|
+
|
|
41
|
+
All other contributions:
|
|
42
|
+
Copyright(c) 2015, 2016 the respective contributors
|
|
43
|
+
All rights reserved.
|
|
44
|
+
|
|
45
|
+
Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
|
46
|
+
copyright over their contributions to Caffe2. The project versioning records
|
|
47
|
+
all such contribution and copyright details. If a contributor wants to further
|
|
48
|
+
mark their specific copyright on a particular contribution, they should
|
|
49
|
+
indicate their copyright solely in the commit message of the change when it is
|
|
50
|
+
committed.
|
|
51
|
+
|
|
52
|
+
All rights reserved.
|
|
53
|
+
|
|
54
|
+
Redistribution and use in source and binary forms, with or without
|
|
55
|
+
modification, are permitted provided that the following conditions are met:
|
|
56
|
+
|
|
57
|
+
1. Redistributions of source code must retain the above copyright
|
|
58
|
+
notice, this list of conditions and the following disclaimer.
|
|
59
|
+
|
|
60
|
+
2. Redistributions in binary form must reproduce the above copyright
|
|
61
|
+
notice, this list of conditions and the following disclaimer in the
|
|
62
|
+
documentation and/or other materials provided with the distribution.
|
|
63
|
+
|
|
64
|
+
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
|
65
|
+
and IDIAP Research Institute nor the names of its contributors may be
|
|
66
|
+
used to endorse or promote products derived from this software without
|
|
67
|
+
specific prior written permission.
|
|
68
|
+
|
|
69
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
70
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
71
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
72
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
73
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
74
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
75
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
76
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
77
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
78
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
79
|
+
POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
difflayers
|