tirex-mirror 2025.8.28.tar.gz → 2025.9.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tirex_mirror-2025.8.28/src/tirex_mirror.egg-info → tirex_mirror-2025.9.9}/PKG-INFO +8 -5
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/pyproject.toml +5 -4
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/api_adapter/forecast.py +0 -1
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/base.py +35 -8
- tirex_mirror-2025.9.9/src/tirex/models/patcher.py +84 -0
- tirex_mirror-2025.9.9/src/tirex/models/slstm/block.py +60 -0
- tirex_mirror-2025.9.9/src/tirex/models/slstm/cell.py +188 -0
- tirex_mirror-2025.9.9/src/tirex/models/slstm/layer.py +67 -0
- tirex_mirror-2025.9.9/src/tirex/models/tirex.py +225 -0
- tirex_mirror-2025.9.9/src/tirex/util.py +13 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9/src/tirex_mirror.egg-info}/PKG-INFO +8 -5
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex_mirror.egg-info/SOURCES.txt +5 -3
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex_mirror.egg-info/requires.txt +8 -4
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/tests/test_chronos_zs.py +9 -5
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/tests/test_forecast.py +10 -8
- tirex_mirror-2025.8.28/src/tirex/models/components.py +0 -147
- tirex_mirror-2025.8.28/src/tirex/models/mixed_stack.py +0 -143
- tirex_mirror-2025.8.28/src/tirex/models/predict_utils.py +0 -72
- tirex_mirror-2025.8.28/src/tirex/models/tirex.py +0 -231
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/LICENSE +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/LICENSE_MIRROR.txt +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/MANIFEST.in +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/NOTICE.txt +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/README.md +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/setup.cfg +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/__init__.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/api_adapter/__init__.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/api_adapter/gluon.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/api_adapter/hf_data.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/api_adapter/standard_adapter.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex/models/__init__.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex_mirror.egg-info/dependency_links.txt +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/src/tirex_mirror.egg-info/top_level.txt +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/tests/test_forecast_adapter.py +0 -0
- {tirex_mirror-2025.8.28 → tirex_mirror-2025.9.9}/tests/test_standard_adapter.py +0 -0
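
In short, this release vendors the sLSTM implementation: the old `components.py`, `mixed_stack.py`, and `predict_utils.py` modules are removed; a rewritten `tirex.py`, a new `models/slstm/` package (`block.py`, `cell.py`, `layer.py`), `models/patcher.py`, and `util.py` take their place. The hard dependencies on `xlstm`, `ninja`, `lightning`, and `dacite` are dropped: `xlstm` and `ninja` move into a new `cuda` extra (needed only for the custom CUDA kernels), and `matplotlib` is added to the `notebooks` extra. The reconstructed diffs below cover the hunks shown by the registry viewer.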
```diff
--- tirex_mirror-2025.8.28/src/tirex_mirror.egg-info/PKG-INFO
+++ tirex_mirror-2025.9.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tirex-mirror
-Version: 2025.8.28
+Version: 2025.9.9
 Summary: Unofficial mirror of NX-AI/tirex for packaging
 Author-email: Arpad Rozsas <rozsasarpi@gmail.com>
 License: NXAI COMMUNITY LICENSE AGREEMENT
@@ -65,17 +65,17 @@ License-File: LICENSE_MIRROR.txt
 License-File: NOTICE.txt
 Requires-Dist: torch
 Requires-Dist: torchvision
-Requires-Dist: xlstm
 Requires-Dist: einops
-Requires-Dist: ninja
 Requires-Dist: huggingface-hub
-Requires-Dist: lightning
 Requires-Dist: numpy
 Requires-Dist: pandas
-Requires-Dist: dacite
 Requires-Dist: tqdm
+Provides-Extra: cuda
+Requires-Dist: xlstm; extra == "cuda"
+Requires-Dist: ninja; extra == "cuda"
 Provides-Extra: notebooks
 Requires-Dist: ipykernel; extra == "notebooks"
+Requires-Dist: matplotlib; extra == "notebooks"
 Provides-Extra: gluonts
 Requires-Dist: gluonts; extra == "gluonts"
 Provides-Extra: hfdataset
@@ -83,7 +83,10 @@ Requires-Dist: datasets; extra == "hfdataset"
 Provides-Extra: test
 Requires-Dist: fev; extra == "test"
 Provides-Extra: all
+Requires-Dist: xlstm; extra == "all"
+Requires-Dist: ninja; extra == "all"
 Requires-Dist: ipykernel; extra == "all"
+Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: gluonts; extra == "all"
 Requires-Dist: datasets; extra == "all"
 Requires-Dist: fev; extra == "all"
```

```diff
--- tirex_mirror-2025.8.28/pyproject.toml
+++ tirex_mirror-2025.9.9/pyproject.toml
@@ -1,12 +1,12 @@
 [project]
 name = "tirex-mirror"
-version = "2025.8.28"
+version = "2025.09.09"
 description = "Unofficial mirror of NX-AI/tirex for packaging"
 readme = "README.md"
 requires-python = ">=3.11"
 classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent",]
 keywords = [ "TiRex", "xLSTM", "Time Series", "Zero-shot", "Deep Learning",]
-dependencies = [ "torch", "torchvision", "xlstm", "einops", "ninja", "huggingface-hub", "lightning", "numpy", "pandas", "dacite", "tqdm",]
+dependencies = [ "torch", "torchvision", "einops", "huggingface-hub", "numpy", "pandas", "tqdm",]
 [[project.authors]]
 name = "Arpad Rozsas"
 email = "rozsasarpi@gmail.com"
@@ -23,11 +23,12 @@ Repository = "https://github.com/rozsasarpi/tirex-mirror"
 Issues = "https://github.com/rozsasarpi/tirex-mirror/issues"
 
 [project.optional-dependencies]
-notebooks = [ "ipykernel",]
+cuda = [ "xlstm", "ninja",]
+notebooks = [ "ipykernel", "matplotlib",]
 gluonts = [ "gluonts",]
 hfdataset = [ "datasets",]
 test = [ "fev",]
-all = [ "ipykernel", "gluonts", "datasets", "fev",]
+all = [ "xlstm", "ninja", "ipykernel", "matplotlib", "gluonts", "datasets", "fev",]
 
 [tool.docformatter]
 diff = false
```

```diff
--- tirex_mirror-2025.8.28/src/tirex/base.py
+++ tirex_mirror-2025.9.9/src/tirex/base.py
@@ -3,13 +3,18 @@
 
 import os
 from abc import ABC, abstractmethod
-from typing import TypeVar
+from typing import Literal, TypeVar
 
+import torch
 from huggingface_hub import hf_hub_download
 
 T = TypeVar("T", bound="PretrainedModel")
 
 
+def skip_cuda():
+    return os.getenv("TIREX_NO_CUDA", "False").lower() in ("true", "1", "t")
+
+
 def parse_hf_repo_id(path):
     parts = path.split("/")
     return "/".join(parts[0:2])
@@ -23,19 +28,30 @@ class PretrainedModel(ABC):
         cls.REGISTRY[cls.register_name()] = cls
 
     @classmethod
-    def from_pretrained(
+    def from_pretrained(
+        cls: type[T], path: str, backend: str, device: str | None = None, hf_kwargs=None, ckp_kwargs=None
+    ) -> T:
         if hf_kwargs is None:
             hf_kwargs = {}
         if ckp_kwargs is None:
             ckp_kwargs = {}
+        if device is None:
+            device = "cuda:0" if backend == "cuda" else "cpu"
         if os.path.exists(path):
             print("Loading weights from local directory")
             checkpoint_path = path
         else:
             repo_id = parse_hf_repo_id(path)
             checkpoint_path = hf_hub_download(repo_id=repo_id, filename="model.ckpt", **hf_kwargs)
-
-
+
+        # load lightning checkpoint
+        checkpoint = torch.load(checkpoint_path, map_location=device, **ckp_kwargs, weights_only=True)
+        model: T = cls(backend=backend, **checkpoint["hyper_parameters"])
+        model.on_load_checkpoint(checkpoint)
+        model.load_state_dict(checkpoint["state_dict"])
+
+        if backend == "cuda":
+            model = model.to(device)
         return model
 
     @classmethod
@@ -43,17 +59,22 @@ class PretrainedModel(ABC):
     def register_name(cls) -> str:
         pass
 
-    def
+    def on_load_checkpoint(self):
         pass
 
 
-def load_model(
+def load_model(
+    path: str,
+    device: str | None = None,
+    backend: Literal["torch", "cuda"] | None = None,
+    hf_kwargs=None,
+    ckp_kwargs=None,
+) -> PretrainedModel:
     """Loads a TiRex model. This function attempts to load the specified model.
 
     Args:
         path (str): Hugging Face path to the model (e.g. NX-AI/TiRex)
         device (str, optional): The device on which to load the model (e.g., "cuda:0", "cpu").
-            If you want to use "cpu" you need to deactivate the sLSTM CUDA kernels (check repository FAQ!).
         hf_kwargs (dict, optional): Keyword arguments to pass to the Hugging Face Hub download method.
         ckp_kwargs (dict, optional): Keyword arguments to pass when loading the checkpoint.
 
@@ -63,6 +84,11 @@ def load_model(path: str, device: str = "cuda:0", hf_kwargs=None, ckp_kwargs=None
     Examples:
         model: ForecastModel = load_model("NX-AI/TiRex")
     """
+
+    if backend is None:
+        backend = "torch" if skip_cuda() else "cuda"
+    assert backend in ["torch", "cuda"], f"Backend can either be torch or cuda, not {backend}!"
+
     try:
         _, model_id = parse_hf_repo_id(path).split("/")
     except:
@@ -70,4 +96,5 @@ def load_model(path: str, device: str = "cuda:0", hf_kwargs=None, ckp_kwargs=None
     model_cls = PretrainedModel.REGISTRY.get(model_id, None)
     if model_cls is None:
         raise ValueError(f"Invalid model id {model_id}")
-
+
+    return model_cls.from_pretrained(path, device=device, backend=backend, hf_kwargs=hf_kwargs, ckp_kwargs=ckp_kwargs)
```

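As a quick orientation, here is a minimal usage sketch of the new backend selection in `base.py`. It only uses names visible in the diff above (`load_model`, `skip_cuda`, the `TIREX_NO_CUDA` variable); downloading the `NX-AI/TiRex` checkpoint and the concrete model class live outside this diff, so treat it as illustrative rather than verified against the released wheel.

```python
# Sketch: the pure-torch backend can be selected explicitly, or it becomes the
# default when TIREX_NO_CUDA is set (that is what skip_cuda() checks).
import os

os.environ["TIREX_NO_CUDA"] = "1"   # backend=None would now resolve to "torch"

from tirex.base import load_model

# device defaults to "cpu" for the torch backend and "cuda:0" for the cuda backend
model = load_model("NX-AI/TiRex", backend="torch")
```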
```diff
--- /dev/null
+++ tirex_mirror-2025.9.9/src/tirex/models/patcher.py
@@ -0,0 +1,84 @@
+# Copyright (c) NXAI GmbH.
+# This software may be used and distributed according to the terms of the NXAI Community License Agreement.
+
+from dataclasses import dataclass
+
+import torch
+
+
+class StandardScaler:
+    def __init__(self, eps: float = 1e-5, nan_loc: float = 0.0):
+        self.eps = eps
+        self.nan_loc = nan_loc
+
+    def scale(
+        self,
+        x: torch.Tensor,
+        loc_scale: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        if loc_scale is None:
+            loc = torch.nan_to_num(torch.nanmean(x, dim=-1, keepdim=True), nan=self.nan_loc)
+            scale = torch.nan_to_num(torch.nanmean((x - loc).square(), dim=-1, keepdim=True).sqrt(), nan=1.0)
+            scale = torch.where(scale == 0, torch.abs(loc) + self.eps, scale)
+        else:
+            loc, scale = loc_scale
+
+        return ((x - loc) / scale), (loc, scale)
+
+    def re_scale(self, x: torch.Tensor, loc_scale: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+        loc, scale = loc_scale
+        return x * scale + loc
+
+
+class Patcher:
+    def __init__(self, patch_size: int, patch_stride: int, left_pad: bool):
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.left_pad = left_pad
+        assert self.patch_size % self.patch_stride == 0
+
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.ndim == 2
+        length = x.shape[-1]
+
+        if length < self.patch_size or (length % self.patch_stride != 0):
+            if length < self.patch_size:
+                padding_size = (
+                    *x.shape[:-1],
+                    self.patch_size - (length % self.patch_size),
+                )
+            else:
+                padding_size = (
+                    *x.shape[:-1],
+                    self.patch_stride - (length % self.patch_stride),
+                )
+            padding = torch.full(size=padding_size, fill_value=torch.nan, dtype=x.dtype, device=x.device)
+            if self.left_pad:
+                x = torch.concat((padding, x), dim=-1)
+            else:
+                x = torch.concat((x, padding), dim=-1)
+
+        return x.unfold(dimension=-1, size=self.patch_size, step=self.patch_stride)
+
+
+@dataclass
+class PatchedUniTokenizerState:
+    scale_state: float
+
+
+class PatchedUniTokenizer:
+    def __init__(self, patch_size: int, patch_stride: int | None = None, scaler: StandardScaler | None = None):
+        self.patch_size = patch_size
+        self.patch_stride = patch_size if patch_stride is None else patch_stride
+        self.scaler = StandardScaler() if scaler is None else scaler
+        self.patcher = Patcher(self.patch_size, self.patch_stride, left_pad=True)
+
+    def context_input_transform(self, data: torch.Tensor):
+        assert data.ndim == 2
+        data, scale_state = self.scaler.scale(data)
+        return self.patcher(data), PatchedUniTokenizerState(scale_state)
+
+    def output_transform(self, data: torch.Tensor, tokenizer_state: PatchedUniTokenizerState):
+        data_shape = data.shape
+        data = self.scaler.re_scale(data.reshape(data_shape[0], -1), tokenizer_state.scale_state).view(*data_shape)
+        return data
```

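A small sketch of how the new tokenizer pieces fit together: the context is standard-scaled, left-padded with NaNs to a multiple of the stride, and unfolded into patches, and model outputs are rescaled back. The shapes below follow from the code in the diff; the module path `tirex.models.patcher` is taken from the file listing above.

```python
import torch

from tirex.models.patcher import PatchedUniTokenizer

tok = PatchedUniTokenizer(patch_size=32)   # stride defaults to patch_size -> non-overlapping patches
context = torch.randn(2, 100)              # (batch, time); 100 is not a multiple of 32

patches, state = tok.context_input_transform(context)
print(patches.shape)                        # torch.Size([2, 4, 32]): left-padded to 128, then unfolded

# A (scaled) model output is mapped back to the original location/scale of each series:
forecast = tok.output_transform(torch.zeros(2, 1, 32), state)
```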
```diff
--- /dev/null
+++ tirex_mirror-2025.9.9/src/tirex/models/slstm/block.py
@@ -0,0 +1,60 @@
+# Copyright (c) NXAI GmbH.
+# This software may be used and distributed according to the terms of the NXAI Community License Agreement.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from tirex.models.slstm.layer import sLSTMBlockConfig, sLSTMLayer
+from tirex.util import round_up_to_next_multiple_of
+
+
+class sLSTMBlock(nn.Module):
+    def __init__(self, config: sLSTMBlockConfig, backend: str):
+        super().__init__()
+        self.config = config
+        self.norm_slstm = RMSNorm(config.embedding_dim)
+        self.slstm_layer = sLSTMLayer(config, backend)
+        self.norm_ffn = RMSNorm(config.embedding_dim)
+
+        up_proj_dim = round_up_to_next_multiple_of(config.embedding_dim * config.ffn_proj_factor, 64)
+        self.ffn = FeedForward(config.embedding_dim, up_proj_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_slstm = self.norm_slstm(x)
+
+        x_slstm = self.slstm_layer(x_slstm, slstm_state=None)
+        x = x + x_slstm
+
+        x_ffn = self.norm_ffn(x)
+        x_ffn = self.ffn(x_ffn)
+        x = x + x_ffn
+        return x
+
+
+class FeedForward(nn.Module):
+    def __init__(self, embedding_dim: int, up_proj_dim: int):
+        super().__init__()
+        self.proj_up_gate = nn.Linear(embedding_dim, up_proj_dim, bias=False)
+        self.proj_up = nn.Linear(embedding_dim, up_proj_dim, bias=False)
+        self.proj_down = nn.Linear(up_proj_dim, embedding_dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.proj_up_gate(x)) * self.proj_up(x)
+        y = self.proj_down(x)
+        return y
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, num_features: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(num_features))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._rms_normalize(x.float()).to(x.dtype)
+        x = x * self.weight
+        return x
+
+    def _rms_normalize(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
```

```diff
--- /dev/null
+++ tirex_mirror-2025.9.9/src/tirex/models/slstm/cell.py
@@ -0,0 +1,188 @@
+# Copyright (c) NXAI GmbH.
+# This software may be used and distributed according to the terms of the NXAI Community License Agreement.
+
+import warnings
+from dataclasses import asdict, dataclass
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from tirex.util import dataclass_from_dict
+
+
+@dataclass
+class sLSTMBlockConfig:
+    embedding_dim: int
+    num_heads: int
+    num_blocks: int
+    ffn_proj_factor: float = 2.6667
+
+    num_states: int = 4  # this is for the sLSTM, a standard LSTM has 2
+    num_gates: int = 4
+
+    @property
+    def head_dim(self):
+        return self.embedding_dim // self.num_heads
+
+
+class sLSTMCell(nn.Module):
+    def __init__(self, config: sLSTMBlockConfig, backend: str):
+        super().__init__()
+        self.config = config
+        self.backend = backend
+
+        self._recurrent_kernel_ = nn.Parameter(
+            torch.empty((config.num_heads, config.head_dim, config.num_gates * config.head_dim), dtype=None)
+        )
+
+        self._bias_ = nn.Parameter(torch.empty((config.num_heads * config.num_gates * config.head_dim), dtype=None))
+
+    def forward(self, input: torch.Tensor, state: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        input = self._get_input(input)
+        state = self._get_state(input, state)
+
+        if self.backend == "torch":
+            all_states = self._impl_torch(input, state)
+        elif self.backend == "cuda":
+            all_states = self._impl_cuda(input, state)
+
+        state = all_states[:, -1]
+        output = self._permute_output(all_states[0][1:])
+        return output.to(input.dtype), state.to(input.dtype)
+
+    def _impl_torch(self, input: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
+        input = input.to(dtype=torch.bfloat16)
+        state = state.to(dtype=torch.bfloat16)
+        recurrent_kernel = self._recurrent_kernel_.to(dtype=torch.bfloat16)
+        bias = self._bias_.to(dtype=torch.float32)
+
+        input = input.view(input.shape[0], input.shape[1], -1)
+        bias = (
+            bias.reshape(self.config.num_heads, self.config.num_gates, self.config.head_dim)
+            .permute(1, 0, 2)
+            .reshape(-1)
+        )
+
+        return slstm_forward(input, state, recurrent_kernel, bias)[0]
+
+    def _impl_cuda(self, input: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
+        if input.device.type != "cuda":
+            warnings.warn(
+                f"You use TiRex with sLSTM CUDA kernels BUT DO NOT LOAD THE DEVICE ON A CUDA DEVICE (device type is {input.device.type})!"
+                "This is not supported and calls to the model will likely lead to an error if you dont move your model to a CUDA device!"
+                "If you want to run TiRex on CPU you need to disable sLSTM CUDA kernels but be aware of the downsides (see FAQ)"
+            )
+
+        if not hasattr(self, "func"):
+            try:
+                from xlstm.blocks.slstm.cell import sLSTMCellConfig as sLSTMCellConfigCuda, sLSTMCellFuncGenerator
+            except ModuleNotFoundError:
+                raise ValueError(
+                    'xlstm package not found! To use the custom cuda backend, install the additional dependencies with: pip install -e ".[cuda]"'
+                )
+            cuda_config = dataclass_from_dict(
+                sLSTMCellConfigCuda, {**asdict(self.config), "hidden_size": self.config.embedding_dim}
+            )
+            self.func = sLSTMCellFuncGenerator(False, cuda_config)
+
+        input = input.permute(0, 1, 3, 2, 4).reshape(input.shape[0], input.shape[1], -1)
+
+        return self.func.apply(
+            False,
+            input.contiguous(),
+            state.contiguous(),
+            self._recurrent_kernel_.contiguous(),
+            self._bias_.contiguous(),
+        )
+
+    def _get_input(self, x: torch.Tensor) -> torch.Tensor:
+        assert x.shape[-1] == self.config.embedding_dim * self.config.num_gates, (
+            f"Input size mismatch: Expected input size {self.config.embedding_dim * self.config.num_gates}, but got {input.size(-1)}."
+        )
+        return x.view(x.shape[0], x.shape[1], self.config.num_gates, self.config.num_heads, -1).permute(1, 0, 2, 3, 4)
+
+    def _get_state(self, input: torch.Tensor, state: torch.Tensor | None) -> torch.Tensor:
+        B = input.shape[1]
+        if state is None:
+            state = torch.zeros(
+                (self.config.num_states, B, self.config.embedding_dim),
+                dtype=input.dtype,
+                device=input.device,
+            )
+
+        assert state.shape == (self.config.num_states, B, self.config.embedding_dim)
+        return state
+
+    def _permute_output(self, output: torch.Tensor) -> torch.Tensor:
+        output = output.view(output.shape[0], output.shape[1], self.config.num_heads, self.config.head_dim)
+        return output.permute(1, 2, 0, 3)
+
+
+def slstm_forward(
+    x: torch.Tensor,  # [S, B, G*I]
+    states: torch.Tensor,  # [4, B, H] only the first is used for recurrence!
+    R: torch.Tensor,  # [K, R*H, H] - K num_heads
+    b: torch.Tensor,  # [T*H]
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    num_states = states.shape[0]
+    sequence_dim = x.shape[0]
+    # this only works for a fully-connected RNN, for a hin change this
+    num_gates_r = R.shape[2] // R.shape[1]
+    hidden_dim = R.shape[1] * R.shape[0]
+    batch_dim = x.shape[1]
+    num_heads = R.shape[0]
+
+    assert batch_dim == states.shape[1]
+    assert hidden_dim == states.shape[2]
+
+    states_all = torch.zeros(
+        [num_states, sequence_dim + 1, batch_dim, hidden_dim],
+        device=x.device,
+        dtype=x.dtype,
+    )
+    states_all[:, 0] = states
+    for i, Wx_t in enumerate(x.unbind(dim=0)):
+        Ry = (
+            states[0]
+            .reshape(batch_dim, num_heads, 1, -1)
+            .matmul(R.unsqueeze(0))
+            .reshape(batch_dim, num_heads, num_gates_r, -1)
+            .transpose(1, 2)
+            .reshape(batch_dim, -1)
+        )
+        sdtype = states.dtype
+        Wx_t, Ry, b, states = Wx_t.float(), Ry.float(), b.float(), states.float()
+        states, gates = slstm_forward_pointwise(Wx_t, Ry, b, states)
+        states = states.to(dtype=sdtype)
+        states_all[:, i + 1] = states
+
+    # shapes ([S, B, H], ([B,H], [B,H], [B,H])
+    return states_all, states
+
+
+def slstm_forward_pointwise(
+    Wx: torch.Tensor,  # dim [B, 4*H]
+    Ry: torch.Tensor,  # dim [B, 4*H]
+    b: torch.Tensor,  # dim [1, 4*H]
+    states: torch.Tensor,  # dim [4, B, H]
+) -> tuple[torch.Tensor, torch.Tensor]:
+    raw = Wx + Ry + b
+    y, c, n, m = torch.unbind(states.view(4, states.shape[1], -1), dim=0)
+
+    iraw, fraw, zraw, oraw = torch.unbind(raw.view(raw.shape[0], 4, -1), dim=1)
+    # with torch.no_grad():  # THE difference to maxg aka max_gradient (here max / max_static)
+    logfplusm = m + F.logsigmoid(fraw)
+    if torch.all(n == 0.0):
+        mnew = iraw
+    else:
+        mnew = torch.max(iraw, logfplusm)
+    ogate = torch.sigmoid(oraw)
+    igate = torch.minimum(torch.exp(iraw - mnew), torch.ones_like(iraw))
+    fgate = torch.minimum(torch.exp(logfplusm - mnew), torch.ones_like(iraw))
+    cnew = fgate * c + igate * torch.tanh(zraw)
+    nnew = fgate * n + igate
+    ynew = ogate * cnew / nnew
+
+    # shapes ([B,H], [B,H], [B,H]), ([B,H],[B,H],[B,H],[B,H])
+    return torch.stack((ynew, cnew, nnew, mnew), dim=0), torch.stack((igate, fgate, zraw, ogate), dim=0)
```

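For orientation, a toy call of the pure-PyTorch recurrence added above. The tensor layouts follow the shape comments in `slstm_forward` (an `[S, B, G*H]` input, a `[4, B, H]` state stack, per-head recurrent kernels); the module path assumes `tirex.models.slstm.cell` is importable as laid out in the file listing, and the random values only demonstrate shapes, not model behaviour.

```python
import torch

from tirex.models.slstm.cell import slstm_forward

S, B, H, num_heads, num_gates = 5, 2, 8, 2, 4
head_dim = H // num_heads

x = torch.randn(S, B, num_gates * H)                         # pre-projected gate inputs [S, B, G*H]
states = torch.zeros(4, B, H)                                # stacked (y, c, n, m); y drives the recurrence
R = torch.randn(num_heads, head_dim, num_gates * head_dim)   # per-head recurrent kernel
b = torch.zeros(num_gates * H)

states_all, last_state = slstm_forward(x, states, R, b)
print(states_all.shape, last_state.shape)                    # torch.Size([4, 6, 2, 8]) torch.Size([4, 2, 8])
```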
```diff
--- /dev/null
+++ tirex_mirror-2025.9.9/src/tirex/models/slstm/layer.py
@@ -0,0 +1,67 @@
+# Copyright (c) NXAI GmbH.
+# This software may be used and distributed according to the terms of the NXAI Community License Agreement.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .cell import sLSTMBlockConfig, sLSTMCell
+
+
+class sLSTMLayer(nn.Module):
+    def __init__(self, config: sLSTMBlockConfig, backend: str):
+        super().__init__()
+        self.config = config
+
+        in_features, num_heads = self.config.embedding_dim, self.config.num_heads
+        self.fgate = LinearHeadwiseExpand(in_features, num_heads)
+        self.igate = LinearHeadwiseExpand(in_features, num_heads)
+        self.zgate = LinearHeadwiseExpand(in_features, num_heads)
+        self.ogate = LinearHeadwiseExpand(in_features, num_heads)
+
+        self.slstm_cell = sLSTMCell(self.config, backend)
+        self.group_norm = MultiHeadLayerNorm(ndim=in_features)
+
+    def forward(self, x: torch.Tensor, slstm_state: torch.Tensor | None = None) -> torch.Tensor:
+        x_g = torch.cat((self.fgate(x), self.igate(x), self.zgate(x), self.ogate(x)), dim=-1)
+
+        y, slstm_state = self.slstm_cell(x_g, state=slstm_state)
+
+        return self.group_norm(y).transpose(1, 2).view(x.shape[0], x.shape[1], -1)
+
+
+class LinearHeadwiseExpand(nn.Module):
+    def __init__(self, in_features, num_heads, expand_factor_up: float = 1):
+        super().__init__()
+        assert num_heads <= in_features, "num_heads must be <= in_features"
+        assert in_features % num_heads == 0, "in_features must be a multiple of num_heads"
+        self.num_heads = num_heads
+
+        out_features = round(expand_factor_up * in_features)
+        out_features_per_head = out_features // num_heads
+        self.weight = nn.Parameter(torch.empty(num_heads, out_features_per_head, in_features // num_heads))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shape = x.shape
+        x = x.view(*shape[:-1], self.num_heads, -1)
+        x = torch.einsum("...hd,hod->...ho", x, self.weight)
+        x = x.reshape(*shape[:-1], -1)
+        return x
+
+
+class MultiHeadLayerNorm(nn.Module):
+    def __init__(self, ndim: int):
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(ndim))
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        assert input.dim() == 4, "Input must be 4D tensor (B, NH, S, DH)"
+        B, NH, S, DH = input.shape
+
+        gn_in_1 = input.transpose(1, 2)  # (B, S, NH, DH)
+        gn_in_2 = gn_in_1.reshape(B * S, NH * DH)  # (B * S, NH * DH)
+        residual_weight = 1.0 + self.weight
+        out = F.group_norm(gn_in_2, num_groups=NH, weight=residual_weight)
+        # (B * S), (NH * DH) -> (B, S, NH, DH) -> (B, NH, S, DH)
+        out = out.view(B, S, NH, DH).transpose(1, 2)
+        return out
```

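Putting the three new sLSTM modules together, a minimal forward-pass sketch. The config values here are made up; the real hyperparameters and weights come from the published checkpoint, so the `torch.empty` parameters are re-initialised only to get finite numbers out of a dry run, and the module paths follow the new file layout.

```python
import torch

from tirex.models.slstm.block import sLSTMBlock
from tirex.models.slstm.cell import sLSTMBlockConfig

config = sLSTMBlockConfig(embedding_dim=64, num_heads=4, num_blocks=1)
block = sLSTMBlock(config, backend="torch")   # "cuda" would route through the optional xlstm kernels

# The cell/gate weights are created with torch.empty / torch.zeros; give them values for a dry run.
for p in block.parameters():
    torch.nn.init.normal_(p, std=0.02)

x = torch.randn(2, 10, 64)                    # (batch, sequence, embedding_dim)
y = block(x)
print(y.shape)                                # torch.Size([2, 10, 64]): the residual block preserves shape
```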