PyPI - hiera-optim - Versions diffs - 0.1.0__tar.gz - Mend

hiera-optim 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

hiera_optim-0.1.0/LICENSE +21 -0
hiera_optim-0.1.0/PKG-INFO +135 -0
hiera_optim-0.1.0/README.md +101 -0
hiera_optim-0.1.0/hiera_optim/__init__.py +44 -0
hiera_optim-0.1.0/hiera_optim/adapters/__init__.py +28 -0
hiera_optim-0.1.0/hiera_optim/adapters/hiera.py +142 -0
hiera_optim-0.1.0/hiera_optim/attention/__init__.py +6 -0
hiera_optim-0.1.0/hiera_optim/attention/mask_unit.py +116 -0
hiera_optim-0.1.0/hiera_optim/checkpoint.py +113 -0
hiera_optim-0.1.0/hiera_optim/kernels/__init__.py +22 -0
hiera_optim-0.1.0/hiera_optim/kernels/flash_qpool.py +220 -0
hiera_optim-0.1.0/hiera_optim/kernels/mask_gather.py +148 -0
hiera_optim-0.1.0/hiera_optim/ops/__init__.py +18 -0
hiera_optim-0.1.0/hiera_optim/ops/mask_gather.py +112 -0
hiera_optim-0.1.0/hiera_optim/patch.py +321 -0
hiera_optim-0.1.0/hiera_optim.egg-info/PKG-INFO +135 -0
hiera_optim-0.1.0/hiera_optim.egg-info/SOURCES.txt +27 -0
hiera_optim-0.1.0/hiera_optim.egg-info/dependency_links.txt +1 -0
hiera_optim-0.1.0/hiera_optim.egg-info/requires.txt +14 -0
hiera_optim-0.1.0/hiera_optim.egg-info/top_level.txt +1 -0
hiera_optim-0.1.0/pyproject.toml +53 -0
hiera_optim-0.1.0/setup.cfg +4 -0
hiera_optim-0.1.0/tests/test_e2e_equivalence.py +111 -0
hiera_optim-0.1.0/tests/test_flash_qpool.py +122 -0
hiera_optim-0.1.0/tests/test_mask_gather.py +159 -0
hiera_optim-0.1.0/tests/test_mask_unit_attention.py +97 -0
hiera_optim-0.1.0/tests/test_matrix.py +235 -0
hiera_optim-0.1.0/tests/test_sdpa_backend.py +90 -0
hiera_optim-0.1.0/tests/test_selective_checkpoint.py +153 -0

hiera_optim-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Maxi Kalcher
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

hiera_optim-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,135 @@
+Metadata-Version: 2.4
+Name: hiera-optim
+Version: 0.1.0
+Summary: Drop-in throughput and memory optimisations for FAIR Hiera (4D-SDPA, gather/scatter, Triton kernels).
+Author: Maxi Kalcher
+License: MIT
+Project-URL: Homepage, https://github.com/avocardio/hiera-optim
+Project-URL: Repository, https://github.com/avocardio/hiera-optim
+Project-URL: Issues, https://github.com/avocardio/hiera-optim/issues
+Keywords: pytorch,transformer,vision,hiera,mae,flash-attention,triton,hopper,h100,gh200
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Operating System :: POSIX :: Linux
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.5.0
+Requires-Dist: triton>=2.3.0
+Provides-Extra: hiera
+Requires-Dist: hiera-transformer>=0.1.4; extra == "hiera"
+Provides-Extra: test
+Requires-Dist: pytest>=7.0; extra == "test"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Dynamic: license-file
+# hiera-optim
+Drop-in throughput and memory optimisations for [FAIR's Hiera](https://github.com/facebookresearch/hiera) and its MAE variant. Two lines:
+```python
+from hiera_optim import optimize
+optimize(model)
+```
+move the model's attention off its silent math-backend fallback and back onto FlashAttention / cuDNN-attn, replace boolean mask indexing with `torch.gather` / `scatter_`, and unblock `torch.compile`. The result is numerically equivalent to the baseline within bf16 noise.
+## Results
+H100 (GH200), bf16, full forward + backward.
+### Production config: Hiera-Base, 224x224, 8 in-chans, B=128
+| | ms / step | samples / s | peak mem |
+|---|---|---|---|
+| FAIR baseline + `torch.compile` | 131.7 | 972 | 14.0 GB |
+| **hiera-optim + `torch.compile`** | **70.3** | **1820** | **9.4 GB** |
+| speedup / saving | 1.88x | 1.87x | 33% |
+### Across the variant matrix (444 GH200 cells)
+| | median | mean | best | worst |
+|---|---|---|---|---|
+| speedup | 1.35x | 1.42x | 2.10x | 1.10x |
+| memory ratio | 74% | 73% | 29% | 99% |
+RTX 4090, Hiera-Base, 8 in-chans, B=32: 1.81x eager, **2.86x with `torch.compile`**.
+Full matrix and per-cell numbers: [`MATRIX_RESULTS.md`](MATRIX_RESULTS.md).
+## Install
+```bash
+pip install hiera-optim
+```
+From source:
+```bash
+git clone https://github.com/avocardio/hiera-optim.git
+cd hiera-optim
+pip install -e .
+```
+Requires PyTorch >= 2.5 and Triton >= 2.3. Recognises FAIR Hiera in-tree (`models.hiera`) or via PyPI (`hiera-transformer`).
+## Usage
+```python
+import torch
+from hiera_optim import optimize
+from hiera import mae_hiera_base_224
+model = mae_hiera_base_224(pretrained=False, in_chans=3, input_size=(224, 224))
+optimize(model)
+model = torch.compile(model, mode="default", dynamic=False)
+x = torch.randn(128, 3, 224, 224, device="cuda", dtype=torch.bfloat16)
+loss, *_ = model(x, mask_ratio=0.6)
+loss.backward()
+```
+`optimize(model)` does two things, in place, weights preserved:
+1. Swap every `MaskUnitAttention` for a 4D-reshape variant so PyTorch SDPA dispatches to FlashAttention / cuDNN-attn / mem-efficient instead of math. FAIR's original feeds SDPA a 5-D tensor which the fused kernels reject, costing ~13x per call on Ada, ~6x on Hopper.
+2. Swap `x[mask.tile(...)]` and `x_dec[mask] = ...` for explicit `torch.gather` / `scatter_`. Removes a slow `indexing_backward_kernel` and the `aten::nonzero` graph break that stops `torch.compile`.
+## Optional
+```python
+from hiera_optim import optimize, enable_stage_checkpointing
+optimize(model, sdpa_backend="auto")           # per-block SDPA hint
+enable_stage_checkpointing(model, stages=(2,)) # OOM lever
+```
+## GPU support
+| Architecture | SM | Status |
+|---|---|---|
+| Ada (RTX 4090, L40) | SM89 | Tested |
+| Hopper (H100, GH200) | SM90 | Tested |
+| Ampere (A100) | SM80 | Should work |
+| Blackwell (B200) | SM100 | Should work |
+## Tests
+```bash
+pip install -e .[test]
+pytest
+```
+112 tests cover all 5 Hiera variants x q_pool {1, 2, 3} x mask ratios x bf16/fp16/fp32 x 1D/2D/3D inputs x classification + MAE.
+## License
+MIT.

hiera_optim-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,101 @@
+# hiera-optim
+Drop-in throughput and memory optimisations for [FAIR's Hiera](https://github.com/facebookresearch/hiera) and its MAE variant. Two lines:
+```python
+from hiera_optim import optimize
+optimize(model)
+```
+move the model's attention off its silent math-backend fallback and back onto FlashAttention / cuDNN-attn, replace boolean mask indexing with `torch.gather` / `scatter_`, and unblock `torch.compile`. The result is numerically equivalent to the baseline within bf16 noise.
+## Results
+H100 (GH200), bf16, full forward + backward.
+### Production config: Hiera-Base, 224x224, 8 in-chans, B=128
+| | ms / step | samples / s | peak mem |
+|---|---|---|---|
+| FAIR baseline + `torch.compile` | 131.7 | 972 | 14.0 GB |
+| **hiera-optim + `torch.compile`** | **70.3** | **1820** | **9.4 GB** |
+| speedup / saving | 1.88x | 1.87x | 33% |
+### Across the variant matrix (444 GH200 cells)
+| | median | mean | best | worst |
+|---|---|---|---|---|
+| speedup | 1.35x | 1.42x | 2.10x | 1.10x |
+| memory ratio | 74% | 73% | 29% | 99% |
+RTX 4090, Hiera-Base, 8 in-chans, B=32: 1.81x eager, **2.86x with `torch.compile`**.
+Full matrix and per-cell numbers: [`MATRIX_RESULTS.md`](MATRIX_RESULTS.md).
+## Install
+```bash
+pip install hiera-optim
+```
+From source:
+```bash
+git clone https://github.com/avocardio/hiera-optim.git
+cd hiera-optim
+pip install -e .
+```
+Requires PyTorch >= 2.5 and Triton >= 2.3. Recognises FAIR Hiera in-tree (`models.hiera`) or via PyPI (`hiera-transformer`).
+## Usage
+```python
+import torch
+from hiera_optim import optimize
+from hiera import mae_hiera_base_224
+model = mae_hiera_base_224(pretrained=False, in_chans=3, input_size=(224, 224))
+optimize(model)
+model = torch.compile(model, mode="default", dynamic=False)
+x = torch.randn(128, 3, 224, 224, device="cuda", dtype=torch.bfloat16)
+loss, *_ = model(x, mask_ratio=0.6)
+loss.backward()
+```
+`optimize(model)` does two things, in place, weights preserved:
+1. Swap every `MaskUnitAttention` for a 4D-reshape variant so PyTorch SDPA dispatches to FlashAttention / cuDNN-attn / mem-efficient instead of math. FAIR's original feeds SDPA a 5-D tensor which the fused kernels reject, costing ~13x per call on Ada, ~6x on Hopper.
+2. Swap `x[mask.tile(...)]` and `x_dec[mask] = ...` for explicit `torch.gather` / `scatter_`. Removes a slow `indexing_backward_kernel` and the `aten::nonzero` graph break that stops `torch.compile`.
+## Optional
+```python
+from hiera_optim import optimize, enable_stage_checkpointing
+optimize(model, sdpa_backend="auto")           # per-block SDPA hint
+enable_stage_checkpointing(model, stages=(2,)) # OOM lever
+```
+## GPU support
+| Architecture | SM | Status |
+|---|---|---|
+| Ada (RTX 4090, L40) | SM89 | Tested |
+| Hopper (H100, GH200) | SM90 | Tested |
+| Ampere (A100) | SM80 | Should work |
+| Blackwell (B200) | SM100 | Should work |
+## Tests
+```bash
+pip install -e .[test]
+pytest
+```
+112 tests cover all 5 Hiera variants x q_pool {1, 2, 3} x mask ratios x bf16/fp16/fp32 x 1D/2D/3D inputs x classification + MAE.
+## License
+MIT.

hiera_optim-0.1.0/hiera_optim/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""hiera_optim — training-throughput optimisations for FAIR's Hiera.
+Quick start::
+    from models.hiera import mae_hiera_base_224         # FAIR Hiera
+    from hiera_optim import optimize
+    model = mae_hiera_base_224(pretrained=False, in_chans=8)
+    optimize(model)                                      # in-place
+    # optionally: model = torch.compile(model, mode="default", dynamic=False)
+What `optimize()` does:
+  1. Swaps every `MaskUnitAttention` for a FlashAttention/cuDNN-friendly
+     4-D variant (`MaskUnitAttentionFast`). Moves SDPA off the math
+     fallback and back onto the fused kernel paths — 5-12× per-call
+     attention speedup.
+  2. Replaces the boolean `x[mask.tile(...)]` and `x_dec[mask] = ...`
+     indexing patterns with explicit `torch.gather` / `scatter_`. Removes
+     the `aten::nonzero` graph break (compile-friendly).
+Optional add-ons (opt-in, not invoked by default):
+  - `optimize(model, sdpa_backend="auto" | "cudnn" | ...)`: pin the SDPA
+    backend per-block. Sometimes helps, sometimes hurts — see RESULTS.md.
+  - `enable_stage_checkpointing(model, stages=(2,))`: trade compute for
+    activation memory at chosen stages. OOM lever, not a throughput tool.
+"""
+from .patch import optimize, swap_mask_unit_attention, recommended_backend
+from .attention import MaskUnitAttentionFast, BACKEND_NAMES
+from .checkpoint import enable_stage_checkpointing, disable_stage_checkpointing
+from .adapters import HieraAdapter, get_hiera_adapter, auto_detect
+# Package version string.
+__version__ = "0.1.0"
+# Public API of the package: the names imported above, re-exported so that
+# `from hiera_optim import *` exposes exactly this set.
+__all__ = [
+    "optimize",
+    "swap_mask_unit_attention",
+    "recommended_backend",
+    "MaskUnitAttentionFast",
+    "BACKEND_NAMES",
+    "enable_stage_checkpointing",
+    "disable_stage_checkpointing",
+    "HieraAdapter",
+    "get_hiera_adapter",
+    "auto_detect",
+]

hiera_optim-0.1.0/hiera_optim/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Model adapters.
+The rest of the package is intentionally model-name-free: it operates on
+PyTorch `nn.Module` graphs and looks up attention/block classes through
+adapters. Each adapter teaches `optimize()` how to find the right submodules
+on a specific model family.
+Currently bundled:
+  - hiera: FAIR's Hiera / MaskedAutoencoderHiera (https://github.com/facebookresearch/hiera)
+Other adapters (Swin, MViTv2, JEPA-Hiera, custom architectures) can plug in by
+implementing the same `ModelAdapter` protocol.
+"""
+from __future__ import annotations
+from typing import Optional
+from .hiera import HieraAdapter, get_hiera_adapter
+__all__ = ["HieraAdapter", "get_hiera_adapter", "auto_detect"]
+def auto_detect(model) -> Optional["ModelAdapter"]:  # type: ignore[name-defined]
+    """Best-effort: return the right adapter for a given model. Returns None
+    if no bundled adapter matches.
+    """
+    a = get_hiera_adapter(model)
+    if a is not None:
+        return a
+    return None

hiera_optim-0.1.0/hiera_optim/adapters/hiera.py ADDED Viewed

@@ -0,0 +1,142 @@
+"""Adapter for FAIR Hiera (https://github.com/facebookresearch/hiera).
+This is the ONE module in `hiera_optim` allowed to import FAIR classes by name.
+Everything else works through this adapter's protocol so the package is
+trivially portable to derivative architectures.
+The adapter resolves at import time and degrades gracefully if FAIR Hiera
+isn't installed — `get_hiera_adapter(model)` returns None instead of crashing.
+"""
+from __future__ import annotations
+import importlib
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Type
+import torch
+import torch.nn as nn
+@dataclass(frozen=True)
+class _HieraSymbols:
+    """Resolved references to the FAIR Hiera classes we touch."""
+    Hiera: Type[nn.Module]
+    MaskedAutoencoderHiera: Type[nn.Module]
+    HieraBlock: Type[nn.Module]
+    MaskUnitAttention: Type[nn.Module]
+    apply_fusion_head: callable
+    undo_windowing: callable
+def _resolve() -> Optional[_HieraSymbols]:
+    """Try a few import paths. Returns None if Hiera isn't available."""
+    candidates = [
+        # In-tree (brain_atlas, the development environment)
+        ("models.hiera", "utils.hiera_utils"),
+        # PyPI package (`pip install hiera-transformer`)
+        ("hiera.hiera", "hiera.hiera_utils"),
+        ("hiera", "hiera.hiera_utils"),
+    ]
+    for hmod, hutil in candidates:
+        try:
+            hm = importlib.import_module(hmod)
+            hu = importlib.import_module(hutil)
+            return _HieraSymbols(
+                Hiera=hm.Hiera,
+                MaskedAutoencoderHiera=hm.MaskedAutoencoderHiera,
+                HieraBlock=hm.HieraBlock,
+                MaskUnitAttention=hm.MaskUnitAttention,
+                apply_fusion_head=hm.apply_fusion_head,
+                undo_windowing=hu.undo_windowing,
+            )
+        except (ImportError, AttributeError):
+            continue
+    return None
+_SYMS = _resolve()
+def is_available() -> bool:
+    return _SYMS is not None
+def symbols() -> _HieraSymbols:
+    """Return resolved FAIR symbols. Raises if Hiera isn't installed."""
+    if _SYMS is None:
+        raise ImportError(
+            "FAIR Hiera not installed. Add `pip install hiera-transformer`, "
+            "or ensure `models.hiera` is importable in this project."
+        )
+    return _SYMS
+class HieraAdapter:
+    """Describes how to introspect / patch a FAIR Hiera model.
+    The adapter is the bridge between FAIR's class hierarchy and our
+    framework-agnostic patching code. Methods are pure introspection — they
+    do not import FAIR classes unless explicitly needed.
+    """
+    def __init__(self):
+        if not is_available():
+            raise ImportError(
+                "HieraAdapter requires FAIR Hiera to be importable."
+            )
+        self._syms = symbols()
+    # ---- Introspection -----------------------------------------------------
+    def matches(self, model: nn.Module) -> bool:
+        """True if `model` is a Hiera-family model."""
+        return isinstance(model, (self._syms.Hiera, self._syms.MaskedAutoencoderHiera))
+    def is_mae(self, model: nn.Module) -> bool:
+        return isinstance(model, self._syms.MaskedAutoencoderHiera)
+    def block_class(self) -> Type[nn.Module]:
+        return self._syms.HieraBlock
+    def attention_class(self) -> Type[nn.Module]:
+        return self._syms.MaskUnitAttention
+    def encoder_blocks(self, model: nn.Module) -> nn.ModuleList:
+        """Returns the encoder block list (`model.blocks`)."""
+        return model.blocks
+    def decoder_blocks(self, model: nn.Module) -> Optional[nn.ModuleList]:
+        """Returns the MAE decoder block list, or None for non-MAE models."""
+        return getattr(model, "decoder_blocks", None)
+    def stage_ends(self, model: nn.Module) -> list[int]:
+        return list(model.stage_ends)
+    # ---- Layout convention ------------------------------------------------
+    #
+    # FAIR's `Unroll` produces a (T outer, nw inner) token layout: token n in
+    # the N axis maps to (t = n // nw, w = n % nw). This is what FAIR's
+    # `do_pool(x, stride)` (which expects stride as the OUTER dim) and the
+    # MaskUnitAttention reshape pattern both rely on.
+    layout = "T_outer_nw_inner"
+    # ---- Helpers used by the patched forwards -----------------------------
+    def apply_fusion_head(self, head: nn.Module, x: torch.Tensor) -> torch.Tensor:
+        return self._syms.apply_fusion_head(head, x)
+    def undo_windowing(self, x, shape, mu_shape):
+        return self._syms.undo_windowing(x, shape, mu_shape)
+# Module-level singleton resolution (cheap; just an isinstance check)
+_DEFAULT_ADAPTER: Optional[HieraAdapter] = None
+def get_hiera_adapter(model: nn.Module) -> Optional[HieraAdapter]:
+    """Return a HieraAdapter if the model is a Hiera-family model, else None."""
+    global _DEFAULT_ADAPTER
+    if not is_available():
+        return None
+    if _DEFAULT_ADAPTER is None:
+        _DEFAULT_ADAPTER = HieraAdapter()
+    return _DEFAULT_ADAPTER if _DEFAULT_ADAPTER.matches(model) else None

hiera_optim-0.1.0/hiera_optim/attention/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Attention modules. Currently exports MaskUnitAttentionFast — the
+FlashAttention/cuDNN-friendly 4-D variant of FAIR's MaskUnitAttention.
+"""
+from .mask_unit import MaskUnitAttentionFast, copy_weights_from_orig, BACKEND_NAMES
+# Names re-exported from `.mask_unit` as this subpackage's public API.
+__all__ = ["MaskUnitAttentionFast", "copy_weights_from_orig", "BACKEND_NAMES"]

hiera_optim-0.1.0/hiera_optim/attention/mask_unit.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Optimized MaskUnitAttention — drop-in replacement.
+Key fixes vs FAIR original (models/hiera.py):
+  - Reshape Q/K/V to 4D (B*num_windows, heads, T, D) so SDPA dispatches to a
+    fused backend (FlashAttention / cuDNN). The original feeds 5D tensors,
+    which fall back to the math backend (12-13x slower on stage-0 shapes
+    per microbench).
+  - Match FAIR's N-axis layout exactly: token n in input has n = t*nw + w,
+    so within-window positions are SLOW-varying and num_windows is FAST.
+    That's because the upstream Unroll module produces this interleaved
+    layout (and do_pool relies on the stride axis being outer).
+  - Skip per-tensor .contiguous() — the permute lands flash-friendly.
+  - Optional per-stage SDPA backend hint via `sdpa_backend` attribute. Set to
+    one of {"cudnn", "flash", "mem_efficient", "math", None}; None lets the
+    PyTorch dispatcher pick. Per-stage tuning is useful because Hiera's small
+    Tq stages (stage-1 post q_pool: T=16) often favor mem-efficient while the
+    long-seq global-attention stages favor cuDNN-attn or flash on Hopper.
+Numerically identical to FAIR baseline up to bf16 noise (~1e-2 max abs diff,
+~1e-3 rel RMS) — the noise is the difference between SDPA math vs flash.
+"""
+from __future__ import annotations
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
+_BACKEND_MAP = {
+    "cudnn": SDPBackend.CUDNN_ATTENTION,
+    "flash": SDPBackend.FLASH_ATTENTION,
+    "mem_efficient": SDPBackend.EFFICIENT_ATTENTION,
+    "math": SDPBackend.MATH,
+}
+BACKEND_NAMES = tuple(_BACKEND_MAP.keys())
+class MaskUnitAttentionFast(nn.Module):
+    """Drop-in replacement for models.hiera.MaskUnitAttention with 4D SDPA.
+    Args:
+        dim: input feature size (in-features of the fused `qkv` projection).
+        dim_out: output feature size; `qkv` projects to `3 * dim_out` and
+            `proj` maps `dim_out -> dim_out`.
+        heads: number of attention heads; `head_dim = dim_out // heads`.
+        q_stride: query pooling factor. When > 1, `forward` max-pools the
+            queries by this factor before attention.
+        window_size: together with `q_stride`, sets the window count
+            `N // (q_stride * window_size)` when `use_mask_unit_attn` is True.
+        use_mask_unit_attn: if False, the whole sequence is a single window.
+        sdpa_backend: optional SDPA backend hint. One of {"cudnn", "flash",
+            "mem_efficient", "math", None}. None = default dispatcher.
+    Raises:
+        ValueError: if `sdpa_backend` is neither None nor a known name.
+    """
+    def __init__(
+        self,
+        dim: int,
+        dim_out: int,
+        heads: int,
+        q_stride: int = 1,
+        window_size: int = 0,
+        use_mask_unit_attn: bool = False,
+        sdpa_backend: Optional[str] = None,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.dim_out = dim_out
+        self.heads = heads
+        self.q_stride = q_stride
+        self.head_dim = dim_out // heads
+        # Stored for reference; `forward` calls SDPA without an explicit scale.
+        self.scale = self.head_dim ** -0.5
+        self.qkv = nn.Linear(dim, 3 * dim_out)
+        self.proj = nn.Linear(dim_out, dim_out)
+        self.window_size = window_size
+        self.use_mask_unit_attn = use_mask_unit_attn
+        # Reject unknown names here so a bad hint fails at construction time.
+        if sdpa_backend is not None and sdpa_backend not in _BACKEND_MAP:
+            raise ValueError(f"sdpa_backend must be one of {list(_BACKEND_MAP)} or None; got {sdpa_backend!r}")
+        self.sdpa_backend = sdpa_backend
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply (optionally windowed, optionally Q-pooled) attention.
+        Args:
+            x: 3-D tensor unpacked as (B, N, C); C must match the input size
+                of the `qkv` layer.
+        Returns:
+            Tensor of shape (B, Tq_out * nw, dim_out), where nw is the window
+            count and Tq_out the per-window query length after pooling.
+        """
+        B, N, _ = x.shape
+        H, D = self.heads, self.head_dim
+        # Window count; a single window when mask-unit attention is disabled.
+        nw = (N // (self.q_stride * self.window_size)) if self.use_mask_unit_attn else 1
+        T = N // nw  # tokens per window before q-pool
+        # qkv: (B, N, 3*dim_out)
+        # FAIR layout: N is interpreted as (T, nw) with nw fast-varying.
+        # We want (3, B*nw, H, T, D) so SDPA gets a 4D Q/K/V.
+        qkv = self.qkv(x).view(B, T, nw, 3, H, D)
+        # permute -> (3, B, nw, H, T, D), then flatten (B, nw) into batch
+        qkv = qkv.permute(3, 0, 2, 4, 1, 5).reshape(3, B * nw, H, T, D)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        if self.q_stride > 1:
+            # Max-pool Q over the q_stride flat axis. T = q_stride * Tq.
+            # FAIR's do_pool: view(B, stride, -1, C).max(dim=1) — stride is OUTER.
+            # So inside T, q_stride is slow-varying. Mirror: view (.., q_stride, Tq, ..)
+            Tq = T // self.q_stride
+            q = q.view(B * nw, H, self.q_stride, Tq, D).amax(dim=2)
+        # 4D SDPA → FlashAttention/cuDNN. Optionally pin a backend.
+        if self.sdpa_backend is None:
+            out = F.scaled_dot_product_attention(q, k, v)
+        else:
+            # Restrict the dispatcher to the requested backend for this call.
+            with sdpa_kernel([_BACKEND_MAP[self.sdpa_backend]]):
+                out = F.scaled_dot_product_attention(q, k, v)
+        # out: (B*nw, H, Tq, D)
+        Tq_out = out.shape[2]
+        # Back to (B, N_out, dim_out) with N_out indexed as (tq, w) tq slow / w fast
+        # out: (B*nw, H, Tq, D) -> (B, nw, H, Tq, D) -> (B, Tq, nw, H, D) -> reshape
+        out = out.view(B, nw, H, Tq_out, D).permute(0, 3, 1, 2, 4).reshape(B, Tq_out * nw, self.dim_out)
+        return self.proj(out)
+@torch.no_grad()
+def copy_weights_from_orig(fast: MaskUnitAttentionFast, orig) -> None:
+    """Copy parameters from FAIR's MaskUnitAttention into a fast one."""
+    fast.qkv.weight.copy_(orig.qkv.weight)
+    fast.qkv.bias.copy_(orig.qkv.bias)
+    fast.proj.weight.copy_(orig.proj.weight)
+    fast.proj.bias.copy_(orig.proj.bias)