morphottention 0.2.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morphottention/_C.cp313-win_amd64.pyd +0 -0
- morphottention/_C.pyi +27 -0
- morphottention/__init__.py +3 -0
- morphottention/autograd.py +141 -0
- morphottention/py.typed +0 -0
- morphottention-0.2.0.dist-info/METADATA +130 -0
- morphottention-0.2.0.dist-info/RECORD +8 -0
- morphottention-0.2.0.dist-info/WHEEL +5 -0
|
Binary file
|
morphottention/_C.pyi
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
|
|
3
|
+
def forward(
    X: torch.Tensor,
    W_phi: torch.Tensor,
    gate_q: torch.Tensor,
    gate_k: torch.Tensor,
    W_V: torch.Tensor,
    H: int,
    cube_m: int,
    scale: float,
    causal: bool,
) -> list[torch.Tensor]:
    """Type stub for the forward entry point of the compiled extension.

    NOTE(review): the autograd wrapper in ``autograd.py`` unpacks the result
    as ``out, lse`` -- confirm the list always holds exactly those two tensors.
    """
|
|
14
|
+
def backward(
    grad_out: torch.Tensor,
    X: torch.Tensor,
    W_phi: torch.Tensor,
    gate_q: torch.Tensor,
    gate_k: torch.Tensor,
    W_V: torch.Tensor,
    out: torch.Tensor,
    lse: torch.Tensor,
    H: int,
    cube_m: int,
    scale: float,
    causal: bool,
) -> list[torch.Tensor]:
    """Type stub for the backward entry point of the compiled extension.

    NOTE(review): the autograd wrapper in ``autograd.py`` unpacks the result
    into five gradients (for ``X``, ``W_phi``, ``gate_q``, ``gate_k`` and
    ``W_V``, in that order) -- confirm against the extension source.
    """
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Autograd wrapper and nn.Module around the compiled Morphottention CUDA kernels.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
from torch import nn
|
|
9
|
+
|
|
10
|
+
from . import _C
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MorphoAttentionFunction(torch.autograd.Function):
    """
    Bridges the CUDA attention kernels into autograd.

    ``forward`` validates its tensor inputs, calls ``_C.forward`` and saves
    everything ``_C.backward`` needs; ``backward`` passes that saved state to
    the compiled extension and returns one gradient slot per ``forward``
    argument.
    """

    @staticmethod
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        x: torch.Tensor,
        W_phi: torch.Tensor,
        gate_q: torch.Tensor,
        gate_k: torch.Tensor,
        W_V: torch.Tensor,
        H: int,
        cube_m: int,
        scale: float,
        causal: bool,
    ) -> torch.Tensor:
        """
        Run the forward kernel and record the state needed by ``backward``.

        :param ctx: autograd context used to stash tensors and scalar options.
        :param x: input activations; must live on a CUDA device.
        :param W_phi: parameter tensor forwarded to the kernel; must be on CUDA.
        :param gate_q: parameter tensor forwarded to the kernel; must be on CUDA.
        :param gate_k: parameter tensor forwarded to the kernel; must be on CUDA.
        :param W_V: parameter tensor forwarded to the kernel; must be on CUDA.
        :param H: number of attention heads.
        :param cube_m: hypercube width per head.
        :param scale: softmax temperature.
        :param causal: whether to apply causal masking.
        :returns: the first tensor produced by ``_C.forward`` (``out``).
        :raises ValueError: if ``x`` or any parameter tensor is not on CUDA.
        """
        if not x.is_cuda:
            raise ValueError("MorphoAttention expects a CUDA tensor")

        # The kernel receives every tensor below, so reject host tensors here
        # with a readable error instead of letting them reach the extension.
        named_params = (
            ("W_phi", W_phi),
            ("gate_q", gate_q),
            ("gate_k", gate_k),
            ("W_V", W_V),
        )
        for label, tensor in named_params:
            if not tensor.is_cuda:
                raise ValueError(
                    f"MorphoAttention expects {label} to be a CUDA tensor, got device {tensor.device}"
                )

        # ``contiguous()`` returns the tensor itself when it is already dense,
        # so this only copies for strided views (e.g. a transposed weight).
        # NOTE(review): assumes the kernel wants dense inputs, as the original
        # code already did for ``x`` -- confirm against the extension.
        x = x.contiguous()
        W_phi = W_phi.contiguous()
        gate_q = gate_q.contiguous()
        gate_k = gate_k.contiguous()
        W_V = W_V.contiguous()

        # ``lse`` is only kept for the backward pass; presumably the softmax
        # log-sum-exp statistics -- TODO confirm.
        out, lse = _C.forward(x, W_phi, gate_q, gate_k, W_V, H, cube_m, scale, causal)

        ctx.save_for_backward(x, W_phi, gate_q, gate_k, W_V, out, lse)
        # Non-tensor options travel as plain attributes on the context.
        ctx.H = H  # type: ignore[attr-defined]
        ctx.cube_m = cube_m  # type: ignore[attr-defined]
        ctx.scale = scale  # type: ignore[attr-defined]
        ctx.causal = causal  # type: ignore[attr-defined]
        return out

    @staticmethod
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        grad_out: torch.Tensor,
    ) -> tuple[torch.Tensor | None, ...]:
        """
        Compute gradients for the five tensor inputs of ``forward``.

        :param ctx: context populated by ``forward``.
        :param grad_out: gradient of the loss w.r.t. the forward output.
        :returns: gradients for ``x``, ``W_phi``, ``gate_q``, ``gate_k`` and
            ``W_V``, followed by ``None`` for the four non-tensor arguments
            (``H``, ``cube_m``, ``scale``, ``causal``).
        """
        x, W_phi, gate_q, gate_k, W_V, out, lse = ctx.saved_tensors  # type: ignore[attr-defined]

        grad_out = grad_out.contiguous()
        dX, dW_phi, d_gate_q, d_gate_k, dW_V = _C.backward(
            grad_out,
            x,
            W_phi,
            gate_q,
            gate_k,
            W_V,
            out,
            lse,
            ctx.H,  # type: ignore[attr-defined]
            ctx.cube_m,  # type: ignore[attr-defined]
            ctx.scale,  # type: ignore[attr-defined]
            ctx.causal,  # type: ignore[attr-defined]
        )

        # autograd expects exactly one entry per forward() argument after ctx.
        return dX, dW_phi, d_gate_q, d_gate_k, dW_V, None, None, None, None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class MorphoAttention(nn.Module):
    """
    Morphological hypercube attention.

    :param dim: model dimension ``D`` (must be divisible by ``num_heads``).
    :param num_heads: number of attention heads ``H``.
    :param cube_m: hypercube width per head ``m`` (the score-GEMM contraction dim).
    :param scale: softmax temperature.
    :param causal: whether to apply causal masking.
    :param dtype: dtype of the learnable parameters (defaults to ``torch.float16``).
    :param device: device the parameters are allocated on; passed straight to
        ``torch.empty``.
    :raises ValueError: if ``dim``, ``num_heads`` or ``cube_m`` is not positive,
        or if ``dim`` is not divisible by ``num_heads``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        cube_m: int,
        *,
        scale: float = 1.0,
        causal: bool = False,
        dtype: torch.dtype = torch.float16,
        device: torch.device | str | None = None,
    ) -> None:
        super().__init__()

        # Reject non-positive sizes up front: otherwise ``num_heads == 0`` and
        # ``dim == 0`` surface as ZeroDivisionError further down, and a
        # negative ``cube_m`` as an opaque allocation error.
        for label, value in (("dim", dim), ("num_heads", num_heads), ("cube_m", cube_m)):
            if value <= 0:
                raise ValueError(f"{label} must be a positive integer, got {value}")
        if dim % num_heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by num_heads ({num_heads})")

        self.dim = dim
        self.num_heads = num_heads
        self.cube_m = cube_m
        self.scale = scale
        self.causal = causal

        # Each head gets an equal slice of the model dimension for values, so
        # W_V ends up (dim, dim).
        head_dim_v = dim // num_heads
        factory = {"dtype": dtype, "device": device}
        self.W_phi = nn.Parameter(torch.empty(dim, num_heads * cube_m, **factory))
        self.gate_q = nn.Parameter(torch.empty(num_heads, cube_m, **factory))
        self.gate_k = nn.Parameter(torch.empty(num_heads, cube_m, **factory))
        self.W_V = nn.Parameter(torch.empty(dim, num_heads * head_dim_v, **factory))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialise the projections from N(0, dim**-0.5) and the gates to one."""
        std = self.dim**-0.5
        with torch.no_grad():
            for weight in (self.W_phi, self.W_V):
                # Sample in float32 and let ``copy_`` cast to the parameter
                # dtype, so sampling never happens in reduced precision.
                sample = torch.empty(weight.shape, dtype=torch.float32, device=weight.device)
                weight.copy_(sample.normal_(0.0, std))
            self.gate_q.fill_(1.0)
            self.gate_k.fill_(1.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the attention to ``x`` using this module's parameters.

        :param x: input activations; ``MorphoAttentionFunction`` requires a
            CUDA tensor.
        :returns: the attention output produced by ``MorphoAttentionFunction``.
        """
        return MorphoAttentionFunction.apply(  # type: ignore[no-any-return, no-untyped-call]
            x, self.W_phi, self.gate_q, self.gate_k, self.W_V, self.num_heads, self.cube_m, self.scale, self.causal
        )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def morpho_attention(
    x: torch.Tensor,
    W_phi: torch.Tensor,
    gate_q: torch.Tensor,
    gate_k: torch.Tensor,
    W_V: torch.Tensor,
    num_heads: int,
    cube_m: int,
    *,
    scale: float = 1.0,
    causal: bool = False,
) -> torch.Tensor:
    """
    Functional entry point for Morphottention attention with autograd support.

    :param x: input activations of shape (B, N, D) on a CUDA device.
    :param W_phi: parameter tensor passed through to the autograd function.
    :param gate_q: parameter tensor passed through to the autograd function.
    :param gate_k: parameter tensor passed through to the autograd function.
    :param W_V: parameter tensor passed through to the autograd function.
    :param num_heads: number of attention heads.
    :param cube_m: hypercube width per head.
    :param scale: softmax temperature.
    :param causal: whether to apply causal masking.
    :returns: the attention output; ``MorphoAttentionFunction.backward``
        supplies gradients for the five tensor arguments.
    """
    # The autograd function takes positional arguments only, in this order.
    call_args = (x, W_phi, gate_q, gate_k, W_V, num_heads, cube_m, scale, causal)
    return MorphoAttentionFunction.apply(*call_args)  # type: ignore[no-any-return, no-untyped-call]
|
morphottention/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: morphottention
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Mathematical Morphology-based self-attention module for PyTorch (CUDA) using Flash-style kernel fusion.
|
|
5
|
+
Keywords: attention,cuda,pytorch,transformer,morphology,flash-attention,ViT
|
|
6
|
+
Author-Email: Vedran Hrabar <vedran.hrabar@outlook.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Environment :: GPU
|
|
10
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
11
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA :: 13
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
17
|
+
Classifier: Programming Language :: C++
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
28
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
29
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
30
|
+
Classifier: Typing :: Typed
|
|
31
|
+
Project-URL: repository, https://github.com/vhrabar/morphottention
|
|
32
|
+
Project-URL: documentation, https://github.com/vhrabar/morphottention/wiki
|
|
33
|
+
Project-URL: Bug Tracker, https://github.com/vhrabar/morphottention/issues
|
|
34
|
+
Requires-Python: >=3.12
|
|
35
|
+
Requires-Dist: torch>=2.12
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# Morphottention
|
|
39
|
+
Mathematical Morphology-based self-attention module for PyTorch using Flash-style kernel fusion.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
Prebuilt wheels are published for CPython 3.14 on Linux (x86_64, aarch64) and
|
|
44
|
+
Windows (x86_64). A working CUDA-enabled PyTorch (`torch >= 2.12`) must already
|
|
45
|
+
be installed in the environment.
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install morphottention
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
The package exposes an `nn.Module` (`MorphoAttention`), a functional entry point
|
|
54
|
+
(`morpho_attention`), and the raw autograd bridge (`MorphoAttentionFunction`).
|
|
55
|
+
All inputs must be CUDA tensors; the module defaults to `float16`.
|
|
56
|
+
|
|
57
|
+
### As an `nn.Module`
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import torch
|
|
61
|
+
from morphottention import MorphoAttention
|
|
62
|
+
|
|
63
|
+
attn = MorphoAttention(
|
|
64
|
+
dim=256, # model dimension D
|
|
65
|
+
num_heads=8, # number of attention heads H
|
|
66
|
+
cube_m=16, # hypercube width per head
|
|
67
|
+
scale=1.0, # softmax temperature
|
|
68
|
+
causal=False, # causal masking flag
|
|
69
|
+
device="cuda",
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
x = torch.randn(2, 128, 256, dtype=torch.float16, device="cuda") # (B, N, D)
|
|
73
|
+
out = attn(x) # (B, N, D)
|
|
74
|
+
out.sum().backward()
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Functional form
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from morphottention import morpho_attention
|
|
81
|
+
|
|
82
|
+
out = morpho_attention(
|
|
83
|
+
x,
|
|
84
|
+
W_phi,
|
|
85
|
+
gate_q,
|
|
86
|
+
gate_k,
|
|
87
|
+
W_V,
|
|
88
|
+
num_heads=8, cube_m=16, scale=1.0,
|
|
89
|
+
causal=False,
|
|
90
|
+
)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Raw autograd bridge
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import torch
|
|
97
|
+
from morphottention import MorphoAttentionFunction
|
|
98
|
+
|
|
99
|
+
B, N, D, H, cube_m = 2, 128, 256, 8, 16
|
|
100
|
+
|
|
101
|
+
x = torch.randn(B, N, D, dtype=torch.float16, device="cuda", requires_grad=True)
|
|
102
|
+
W_phi = torch.randn(D, H * cube_m, dtype=torch.float16, device="cuda", requires_grad=True)
|
|
103
|
+
gate_q = torch.ones(H, cube_m, dtype=torch.float16, device="cuda", requires_grad=True)
|
|
104
|
+
gate_k = torch.ones(H, cube_m, dtype=torch.float16, device="cuda", requires_grad=True)
|
|
105
|
+
W_V = torch.randn(D, D, dtype=torch.float16, device="cuda", requires_grad=True)
|
|
106
|
+
|
|
107
|
+
out = MorphoAttentionFunction.apply(
|
|
108
|
+
x, W_phi, gate_q, gate_k, W_V,
|
|
109
|
+
H, cube_m, 1.0, False, # num_heads, cube_m, scale, causal
|
|
110
|
+
) # (B, N, D)
|
|
111
|
+
out.sum().backward()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
`W_phi` has shape `(D, H * cube_m)`, `W_V` has shape `(D, D)`, and `gate_q` /
|
|
115
|
+
`gate_k` each have shape `(H, cube_m)`.
|
|
116
|
+
|
|
117
|
+
## Building from source
|
|
118
|
+
|
|
119
|
+
Requires the CUDA 13.X toolkit (`nvcc`) and a matching `torch` build:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
uv sync --package morphottention --no-dev --group build
|
|
123
|
+
uv build --package morphottention --wheel --no-build-isolation
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## License
|
|
127
|
+
|
|
128
|
+
MIT
|
|
129
|
+
|
|
130
|
+
Copyright © 2026 Vedran Hrabar.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
morphottention/__init__.py,sha256=geq-PCIo1BkEd77w7_3-vTUKGHBFFHs-CTzT3tlMwGQ,159
|
|
2
|
+
morphottention/_C.cp313-win_amd64.pyd,sha256=UZvNR1ewZ5Zsjd15H6Ry9TCmo-wCTw5dsOKpjspDXZM,786944
|
|
3
|
+
morphottention/_C.pyi,sha256=jFKwfXOxsS7Y9FjwKBGaVDpi2YqwN_x0cP1mJJCJO_I,572
|
|
4
|
+
morphottention/autograd.py,sha256=w07iFr_y8D7qMBlCbbdjZDtObCP5DbKXRceVe-jwyDY,4633
|
|
5
|
+
morphottention/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
morphottention-0.2.0.dist-info/METADATA,sha256=tCP9s_kBbh8fzl1kLw1a5zUVozbvBxw598XksuFgn-w,4256
|
|
7
|
+
morphottention-0.2.0.dist-info/WHEEL,sha256=UZrbbE4r80xj7Ncfa6JoeTVe-77bdXLkKUA63V8pKWQ,106
|
|
8
|
+
morphottention-0.2.0.dist-info/RECORD,,
|