tirex-mirror 2025.10.2__tar.gz → 2025.10.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {tirex_mirror-2025.10.2/src/tirex_mirror.egg-info → tirex_mirror-2025.10.7}/PKG-INFO +1 -1
  2. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/pyproject.toml +1 -1
  3. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/api_adapter/forecast.py +163 -5
  4. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/api_adapter/gluon.py +2 -2
  5. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/api_adapter/hf_data.py +2 -2
  6. tirex_mirror-2025.10.7/src/tirex/api_adapter/standard_adapter.py +90 -0
  7. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/base.py +26 -5
  8. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/slstm/cell.py +66 -77
  9. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/tirex.py +17 -0
  10. tirex_mirror-2025.10.7/src/tirex/util.py +617 -0
  11. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7/src/tirex_mirror.egg-info}/PKG-INFO +1 -1
  12. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex_mirror.egg-info/SOURCES.txt +2 -1
  13. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/tests/test_forecast.py +17 -6
  14. tirex_mirror-2025.10.7/tests/test_standard_adapter.py +166 -0
  15. tirex_mirror-2025.10.7/tests/test_util_freq.py +112 -0
  16. tirex_mirror-2025.10.2/src/tirex/api_adapter/standard_adapter.py +0 -67
  17. tirex_mirror-2025.10.2/src/tirex/util.py +0 -13
  18. tirex_mirror-2025.10.2/tests/test_standard_adapter.py +0 -183
  19. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/LICENSE +0 -0
  20. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/LICENSE_MIRROR.txt +0 -0
  21. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/MANIFEST.in +0 -0
  22. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/NOTICE.txt +0 -0
  23. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/README.md +0 -0
  24. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/setup.cfg +0 -0
  25. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/__init__.py +0 -0
  26. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/api_adapter/__init__.py +0 -0
  27. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/__init__.py +0 -0
  28. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/patcher.py +0 -0
  29. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/slstm/block.py +0 -0
  30. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex/models/slstm/layer.py +0 -0
  31. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex_mirror.egg-info/dependency_links.txt +0 -0
  32. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex_mirror.egg-info/requires.txt +0 -0
  33. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/src/tirex_mirror.egg-info/top_level.txt +0 -0
  34. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/tests/test_chronos_zs.py +0 -0
  35. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/tests/test_forecast_adapter.py +0 -0
  36. {tirex_mirror-2025.10.2 → tirex_mirror-2025.10.7}/tests/test_slstm_torch_vs_cuda.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: tirex-mirror
- Version: 2025.10.2
+ Version: 2025.10.7
  Summary: Unofficial mirror of NX-AI/tirex for packaging
  Author-email: Arpad Rozsas <rozsasarpi@gmail.com>
  License: NXAI COMMUNITY LICENSE AGREEMENT
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "tirex-mirror"
- version = "2025.10.02"
+ version = "2025.10.07"
  description = "Unofficial mirror of NX-AI/tirex for packaging"
  readme = "README.md"
  requires-python = ">=3.11"
src/tirex/api_adapter/forecast.py
@@ -2,15 +2,22 @@
  # This software may be used and distributed according to the terms of the NXAI Community License Agreement.

  from abc import ABC, abstractmethod
- from typing import Literal
+ from functools import partial
+ from math import ceil
+ from typing import Literal, Optional

  import torch

+ from tirex.util import frequency_resample
+
  from .standard_adapter import ContextType, get_batches

  DEF_TARGET_COLUMN = "target"
  DEF_META_COLUMNS = ("start", "item_id")

+ # Allowed resampling strategies (extend as new strategies are implemented)
+ RESAMPLE_STRATEGIES: list[str] = ["frequency"]
+

  def _format_output(
      quantiles: torch.Tensor,
@@ -33,6 +40,27 @@ def _format_output(
      raise ValueError(f"Invalid output type: {output_type}")


+ def _pad_time_series_batch(
+     batch_series: list[torch.Tensor],
+     max_length: int,
+ ) -> torch.Tensor:
+     if not batch_series:
+         return torch.empty((0, max_length))
+
+     first = batch_series[0]
+     dtype = first.dtype if first.is_floating_point() else torch.float32
+     device = first.device
+
+     padded = torch.full((len(batch_series), max_length), float("nan"), dtype=dtype, device=device)
+
+     for idx, series in enumerate(batch_series):
+         series = series.to(padded.dtype)
+         series_len = series.shape[0]
+         padded[idx, max_length - series_len :] = series
+
+     return padded
+
+
  def _as_generator(batches, fc_func, quantile_levels, output_type, **predict_kwargs):
      for batch_ctx, batch_meta in batches:
          quantiles, mean = fc_func(batch_ctx, **predict_kwargs)
@@ -45,7 +73,105 @@ def _as_generator(batches, fc_func, quantile_levels, output_type, **predict_kwar
          )


- def _gen_forecast(fc_func, batches, output_type, quantile_levels, yield_per_batch, **predict_kwargs):
+ def _call_fc_with_padding(fc_func, batch_series: list[torch.Tensor], **predict_kwargs):
+     if not batch_series:
+         raise ValueError("Received empty batch for forecasting")
+
+     max_len = max(series.shape[0] for series in batch_series)
+     padded_ts = _pad_time_series_batch(batch_series, max_len)
+
+     return fc_func(padded_ts, **predict_kwargs)
+
+
+ def _resample_fc_func_wrapper(
+     fc_func,
+     batch,
+     resample_strategy: str,
+     max_context: int = 2016,
+     **predict_kwargs,
+ ):
+     # downsample the time series based on the dominant frequencies, if enabled
+     max_period = (max_context // 1000) * 500
+     prediction_length = predict_kwargs.get("prediction_length", 100)
+     batch_resampled_ts: list[torch.Tensor] = []
+     fc_resample_fns = []
+     scaling_factors = []
+
+     # select the function doing the resampling
+     ctx_resample_fn = lambda x: (x, 1.0, (lambda y: y))
+     match resample_strategy:
+         case "frequency":
+             ctx_resample_fn = frequency_resample
+         case _:
+             raise RuntimeError("This shouldn't happen.")
+
+     for series in batch:
+         resampled_ts, _sample_factor, fc_resample_fn = ctx_resample_fn(
+             series,
+             prediction_length=prediction_length,
+             max_period=max_period,
+         )
+
+         batch_resampled_ts.append(resampled_ts)
+         fc_resample_fns.append(fc_resample_fn)
+         scaling_factors.append(_sample_factor)
+
+     # Compute per-item required horizons (in downsampled domain)
+     per_item_pred_lens = [int(ceil(prediction_length * sf)) for sf in scaling_factors]
+     max_pred_len = max(per_item_pred_lens) if per_item_pred_lens else int(prediction_length)
+     predict_kwargs.update(prediction_length=max_pred_len)
+
+     max_ts_length = max(ts.shape[0] for ts in batch_resampled_ts)
+     padded_ts = _pad_time_series_batch(batch_resampled_ts, max_ts_length)
+     print(f"Average sample batch factor: {sum(scaling_factors) / len(scaling_factors)}")
+
+     # generate prediction
+     fc_quantiles, fc_mean = fc_func(padded_ts, **predict_kwargs)
+
+     batch_prediction_q = []
+     batch_prediction_m = []
+     for el_q, el_m, fc_resample_fn, item_pred_len in zip(fc_quantiles, fc_mean, fc_resample_fns, per_item_pred_lens):
+         # truncate the forecasts to their individual sample factor adjusted prediction lengths
+         el_q = el_q[:item_pred_len, ...]  # [T, Q]
+         el_m = el_m[:item_pred_len]  # [T]
+
+         # upsample prediction
+         quantiles = fc_resample_fn(el_q.squeeze(0).transpose(0, 1)).transpose(0, 1)  # [T, Q]
+         mean = fc_resample_fn(el_m.squeeze(0))
+
+         quantiles = quantiles[:prediction_length, ...]
+         mean = mean[:prediction_length]
+
+         batch_prediction_q.append(quantiles)
+         batch_prediction_m.append(mean)
+
+     return torch.stack(batch_prediction_q, dim=0), torch.stack(batch_prediction_m, dim=0)
+
+
+ def _gen_forecast(
+     fc_func,
+     batches,
+     output_type,
+     quantile_levels,
+     yield_per_batch,
+     resample_strategy: str | None = None,
+     max_context: int = 2016,
+     **predict_kwargs,
+ ):
+     base_fc_func = fc_func
+
+     if resample_strategy is not None:
+         if resample_strategy not in RESAMPLE_STRATEGIES:
+             raise ValueError(f"Invalid resample strategy: {resample_strategy}. Allowed: {RESAMPLE_STRATEGIES}")
+         fc_func = partial(
+             _resample_fc_func_wrapper,
+             base_fc_func,
+             resample_strategy=resample_strategy,
+             max_context=max_context,
+         )
+     else:
+         fc_func = partial(_call_fc_with_padding, base_fc_func)
+
      if yield_per_batch:
          return _as_generator(batches, fc_func, quantile_levels, output_type, **predict_kwargs)

@@ -92,6 +218,9 @@ def _common_forecast_doc():
              forecasts batch by batch as they are computed.
              Defaults to `False`.

+         resample_strategy (Optional[str], optional): Choose a resampling strategy. Allowed values: {RESAMPLE_STRATEGIES}.
+             If `None`, no resampling is applied. Currently only "frequency" is supported.
+
          **predict_kwargs: Additional keyword arguments that are passed directly to the underlying
              prediction mechanism of the pre-trained model. Refer to the model's
              internal prediction method documentation for available options.
@@ -113,6 +242,11 @@ class ForecastModel(ABC):
      def _forecast_quantiles(self, batch, **predict_kwargs):
          pass

+     @property
+     def max_context_length(self) -> int:
+         # retrieve the max_context attribute of the model configuration if present
+         return getattr(getattr(self, "config", None), "max_context", 2016)
+
      def forecast(
          self,
          context: ContextType,
@@ -120,6 +254,7 @@ class ForecastModel(ABC):
          batch_size: int = 512,
          quantile_levels: list[float] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
          yield_per_batch: bool = False,
+         resample_strategy: Literal["frequency"] | None = None,
          **predict_kwargs,
      ):
          f"""
@@ -134,7 +269,14 @@ class ForecastModel(ABC):
          assert batch_size >= 1, "Batch size must be >= 1"
          batches = get_batches(context, batch_size)
          return _gen_forecast(
-             self._forecast_quantiles, batches, output_type, quantile_levels, yield_per_batch, **predict_kwargs
+             self._forecast_quantiles,
+             batches,
+             output_type,
+             quantile_levels,
+             yield_per_batch,
+             resample_strategy=resample_strategy,
+             max_context=self.max_context_length,
+             **predict_kwargs,
          )

      def forecast_gluon(
@@ -144,6 +286,7 @@ class ForecastModel(ABC):
          batch_size: int = 512,
          quantile_levels: list[float] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
          yield_per_batch: bool = False,
+         resample_strategy: Literal["frequency"] | None = None,
          data_kwargs: dict = {},
          **predict_kwargs,
      ):
@@ -165,7 +308,14 @@ class ForecastModel(ABC):

          batches = get_gluon_batches(gluonDataset, batch_size, **data_kwargs)
          return _gen_forecast(
-             self._forecast_quantiles, batches, output_type, quantile_levels, yield_per_batch, **predict_kwargs
+             self._forecast_quantiles,
+             batches,
+             output_type,
+             quantile_levels,
+             yield_per_batch,
+             resample_strategy=resample_strategy,
+             max_context=self.max_context_length,
+             **predict_kwargs,
          )

      def forecast_hfdata(
@@ -175,6 +325,7 @@ class ForecastModel(ABC):
          batch_size: int = 512,
          quantile_levels: list[float] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
          yield_per_batch: bool = False,
+         resample_strategy: Literal["frequency"] | None = None,
          data_kwargs: dict = {},
          **predict_kwargs,
      ):
@@ -198,5 +349,12 @@ class ForecastModel(ABC):

          batches = get_hfdata_batches(hf_dataset, batch_size, **data_kwargs)
          return _gen_forecast(
-             self._forecast_quantiles, batches, output_type, quantile_levels, yield_per_batch, **predict_kwargs
+             self._forecast_quantiles,
+             batches,
+             output_type,
+             quantile_levels,
+             yield_per_batch,
+             resample_strategy=resample_strategy,
+             max_context=self.max_context_length,
+             **predict_kwargs,
          )
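
A minimal usage sketch of the new resample_strategy option (not part of the diff; assumes the package's public load_model entry point, the default torch output type, and illustrative sizes):

    import torch
    from tirex import load_model

    model = load_model("NX-AI/TiRex")            # example path from the docstring above
    context = torch.randn(4, 2016)               # 4 hypothetical series of length 2016
    quantiles, mean = model.forecast(
        context,
        prediction_length=100,
        resample_strategy="frequency",           # new in 2025.10.7; None (default) disables resampling
    )
    # expected shapes: quantiles [4, 100, 9] (9 default quantile levels), mean [4, 100]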
src/tirex/api_adapter/gluon.py
@@ -7,7 +7,7 @@ from gluonts.dataset.common import Dataset
  from gluonts.dataset.field_names import FieldName
  from gluonts.model.forecast import QuantileForecast

- from .standard_adapter import _batch_pad_iterable
+ from .standard_adapter import _batch_iterable

  DEF_TARGET_COLUMN = FieldName.TARGET  # target
  DEF_META_COLUMNS = (FieldName.START, FieldName.ITEM_ID)
@@ -27,7 +27,7 @@ def _get_gluon_ts_map(**gluon_kwargs):


  def get_gluon_batches(gluonDataset: Dataset, batch_size: int, **gluon_kwargs):
-     return _batch_pad_iterable(map(_get_gluon_ts_map(**gluon_kwargs), gluonDataset), batch_size)
+     return _batch_iterable(map(_get_gluon_ts_map(**gluon_kwargs), gluonDataset), batch_size)


  def format_gluonts_output(quantile_forecasts: torch.Tensor, mean_forecasts, meta: list[dict], quantile_levels):
src/tirex/api_adapter/hf_data.py
@@ -4,7 +4,7 @@
  import datasets
  import torch

- from .standard_adapter import _batch_pad_iterable
+ from .standard_adapter import _batch_iterable

  DEF_TARGET_COLUMN = "target"

@@ -35,4 +35,4 @@ def _get_hf_map(dataset: datasets.Dataset, **hf_kwargs):

  def get_hfdata_batches(hf_dataset: datasets.Dataset, batch_size: int, **hf_kwargs):
      dataset, map_func = _get_hf_map(hf_dataset, **hf_kwargs)
-     return _batch_pad_iterable(map(map_func, dataset), batch_size)
+     return _batch_iterable(map(map_func, dataset), batch_size)
src/tirex/api_adapter/standard_adapter.py (new file)
@@ -0,0 +1,90 @@
+ # Copyright (c) NXAI GmbH.
+ # This software may be used and distributed according to the terms of the NXAI Community License Agreement.
+
+ import itertools
+ from collections.abc import Iterable, Iterator, Sequence
+ from typing import Union
+
+ import numpy as np
+ import torch
+
+ ContextType = Union[
+     torch.Tensor,
+     np.ndarray,
+     list[torch.Tensor],
+     list[np.ndarray],
+ ]
+
+
+ def _ensure_1d_tensor(sample) -> torch.Tensor:
+     if isinstance(sample, torch.Tensor):
+         tensor = sample
+     else:
+         tensor = torch.as_tensor(sample)
+
+     if tensor.ndim > 1:
+         tensor = tensor.squeeze()
+
+     assert tensor.ndim == 1, "Each sample must be one-dimensional"
+     return tensor
+
+
+ def _batched_slice(
+     full_batch,
+     full_meta: list[dict] | None,
+     batch_size: int,
+ ) -> Iterator[tuple[list[torch.Tensor], list[dict]]]:
+     total = len(full_batch)
+     for start in range(0, total, batch_size):
+         batch = full_batch[start : start + batch_size]
+         meta = full_meta[start : start + batch_size] if full_meta is not None else [{} for _ in range(len(batch))]
+
+         batch_series = []
+         for idx in range(len(batch)):
+             sample = batch[idx]
+             tensor = _ensure_1d_tensor(sample)
+             batch_series.append(tensor)
+
+         yield batch_series, meta
+
+
+ def _batched(iterable: Iterable, n: int):
+     it = iter(iterable)
+     while batch := tuple(itertools.islice(it, n)):
+         yield batch
+
+
+ def _batch_iterable(
+     iterable: Iterable[tuple[torch.Tensor, dict | None]],
+     batch_size: int,
+ ) -> Iterator[tuple[list[torch.Tensor], list[dict]]]:
+     for batch in _batched(iterable, batch_size):
+         series_list: list[torch.Tensor] = []
+         meta_list: list[dict] = []
+
+         for sample, meta in batch:
+             tensor = _ensure_1d_tensor(sample)
+             assert len(tensor) > 0, "Each sample needs to have a length > 0"
+             series_list.append(tensor)
+             meta_list.append(meta if meta is not None else {})
+
+         yield series_list, meta_list
+
+
+ def get_batches(context: ContextType, batch_size: int):
+     batches = None
+     if isinstance(context, torch.Tensor):
+         if context.ndim == 1:
+             context = context.unsqueeze(0)
+         assert context.ndim == 2
+         batches = _batched_slice(context, None, batch_size)
+     elif isinstance(context, np.ndarray):
+         if context.ndim == 1:
+             context = np.expand_dims(context, axis=0)
+         assert context.ndim == 2
+         batches = _batched_slice(context, None, batch_size)
+     elif isinstance(context, (list, Iterable)):
+         batches = _batch_iterable(map(lambda x: (torch.Tensor(x), None), context), batch_size)
+     if batches is None:
+         raise ValueError(f"Context type {type(context)} not supported! Supported Types: {ContextType}")
+     return batches
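
Judging from the import renames above, the rewritten adapter no longer pads inside the batching step (the former _batch_pad_iterable); it only groups 1-D series with their metadata, and padding is applied later by _pad_time_series_batch in forecast.py. A small illustration of the new get_batches behavior (not part of the diff; module path follows the file list above):

    import numpy as np
    import torch
    from tirex.api_adapter.standard_adapter import get_batches

    context = [np.arange(10.0), torch.arange(25.0)]       # mixed, unequal-length inputs
    for series_list, meta_list in get_batches(context, batch_size=2):
        # series keep their original lengths; missing metadata becomes empty dicts
        print([tuple(s.shape) for s in series_list], meta_list)   # [(10,), (25,)] [{}, {}]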
src/tirex/base.py
@@ -1,6 +1,7 @@
  # Copyright (c) NXAI GmbH.
  # This software may be used and distributed according to the terms of the NXAI Community License Agreement.

+ import logging
  import os
  from abc import ABC, abstractmethod
  from typing import Literal, TypeVar
@@ -8,7 +9,10 @@ from typing import Literal, TypeVar
  import torch
  from huggingface_hub import hf_hub_download

+ from tirex.models.slstm.cell import sLSTMCellTorch
+
  T = TypeVar("T", bound="PretrainedModel")
+ VERSION_DELIMITER = "-"


  def skip_cuda():
@@ -29,6 +33,17 @@ def parse_hf_repo_id(path):
      return "/".join(parts[0:2])


+ def parse_model_string(model_string):
+     if VERSION_DELIMITER in model_string:
+         parts = model_string.split(VERSION_DELIMITER)
+         model_id, version = parts[0], parts[0]
+     else:
+         model_id = model_string
+         version = None
+
+     return model_id, version
+
+
  class PretrainedModel(ABC):
      REGISTRY: dict[str, "PretrainedModel"] = {}

@@ -38,7 +53,7 @@ class PretrainedModel(ABC):

      @classmethod
      def from_pretrained(
-         cls: type[T], path: str, backend: str, device: str | None = None, hf_kwargs=None, ckp_kwargs=None
+         cls: type[T], path: str, backend: str, device: str | None = None, compile=False, hf_kwargs=None, ckp_kwargs=None
      ) -> T:
          if hf_kwargs is None:
              hf_kwargs = {}
@@ -58,9 +73,10 @@ class PretrainedModel(ABC):
          model: T = cls(backend=backend, **checkpoint["hyper_parameters"])
          model.on_load_checkpoint(checkpoint)
          model.load_state_dict(checkpoint["state_dict"])
+         model = model.to(device)

-         if backend == "cuda":
-             model = model.to(device)
+         if compile and backend == "torch":
+             sLSTMCellTorch.slstm_forward = torch.compile(sLSTMCellTorch.slstm_forward, mode="max-autotune")
          return model

      @classmethod
@@ -76,6 +92,7 @@ def load_model(
      path: str,
      device: str | None = None,
      backend: Literal["torch", "cuda"] | None = None,
+     compile: bool = False,
      hf_kwargs=None,
      ckp_kwargs=None,
  ) -> PretrainedModel:
@@ -85,6 +102,7 @@ def load_model(
          path (str): Hugging Face path to the model (e.g. NX-AI/TiRex)
          device (str, optional): The device on which to load the model (e.g., "cuda:0", "cpu").
          backend (torch | cuda): What backend to use, torch or the custom CUDA kernels. Defaults to cuda when xlstm is installed, else torch.
+         compile (bool, optional): toch.compile the sLSTM cells, only works with the torch backend
          hf_kwargs (dict, optional): Keyword arguments to pass to the Hugging Face Hub download method.
          ckp_kwargs (dict, optional): Keyword arguments to pass when loading the checkpoint.

@@ -99,11 +117,14 @@
          backend = "torch" if skip_cuda() or not xlstm_available() else "cuda"

      try:
-         _, model_id = parse_hf_repo_id(path).split("/")
+         _, model_string = parse_hf_repo_id(path).split("/")
+         model_id, version = parse_model_string(model_string)
      except:
          raise ValueError(f"Invalid model path {path}")
      model_cls = PretrainedModel.REGISTRY.get(model_id, None)
      if model_cls is None:
          raise ValueError(f"Invalid model id {model_id}")

-     return model_cls.from_pretrained(path, device=device, backend=backend, hf_kwargs=hf_kwargs, ckp_kwargs=ckp_kwargs)
+     return model_cls.from_pretrained(
+         path, device=device, backend=backend, compile=compile, hf_kwargs=hf_kwargs, ckp_kwargs=ckp_kwargs
+     )
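
A hedged example of the new compile flag on load_model (not part of the diff; per from_pretrained above it only takes effect with the torch backend, and the path and device are illustrative):

    from tirex import load_model

    # torch.compile is applied to sLSTMCellTorch.slstm_forward with mode="max-autotune";
    # on the CUDA backend the flag is ignored.
    model = load_model("NX-AI/TiRex", backend="torch", device="cpu", compile=True)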
src/tirex/models/slstm/cell.py
@@ -43,13 +43,11 @@ class sLSTMCell(nn.Module):
          state = self._get_state(input, state)

          if self.backend == "torch":
-             all_states = self._impl_torch(input, state)
+             output, state = self._impl_torch(input, state)
          elif self.backend == "cuda":
-             all_states = self._impl_cuda(input, state)
+             output, state = self._impl_cuda(input, state)

-         state = all_states[:, -1]
-         output = self._permute_output(all_states[0][1:])
-         return output.to(input.dtype), state.to(input.dtype)
+         return self._permute_output(output).to(input.dtype), state.to(input.dtype)

      def _impl_torch(self, input: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
          input = input.to(dtype=torch.bfloat16)
@@ -64,7 +62,7 @@
              .reshape(-1)
          )

-         return slstm_forward(input, state, recurrent_kernel, bias)[0]
+         return sLSTMCellTorch.slstm_forward(input, state, recurrent_kernel, bias)

      def _impl_cuda(self, input: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
          if input.device.type != "cuda":
@@ -88,7 +86,7 @@

          input = input.permute(0, 1, 3, 2, 4).reshape(input.shape[0], input.shape[1], -1)

-         return self.func.apply(
+         all_states = self.func.apply(
              False,
              input.contiguous(),
              state.contiguous(),
@@ -96,6 +94,10 @@
              self._bias_.contiguous(),
          )

+         state = all_states[:, -1]
+         output = all_states[0][1:]
+         return output, state
+
      def _get_input(self, x: torch.Tensor) -> torch.Tensor:
          assert x.shape[-1] == self.config.embedding_dim * self.config.num_gates, (
              f"Input size mismatch: Expected input size {self.config.embedding_dim * self.config.num_gates}, but got {input.size(-1)}."
@@ -119,73 +121,60 @@ class sLSTMCell(nn.Module):
          return output.permute(1, 2, 0, 3)


- def slstm_forward(
-     x: torch.Tensor,  # [S, B, G*I]
-     states: torch.Tensor,  # [4, B, H] only the first is used for recurrence!
-     R: torch.Tensor,  # [K, R*H, H] - K num_heads
-     b: torch.Tensor,  # [T*H]
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-     num_states = states.shape[0]
-     sequence_dim = x.shape[0]
-     # this only works for a fully-connected RNN, for a hin change this
-     num_gates_r = R.shape[2] // R.shape[1]
-     hidden_dim = R.shape[1] * R.shape[0]
-     batch_dim = x.shape[1]
-     num_heads = R.shape[0]
-
-     assert batch_dim == states.shape[1]
-     assert hidden_dim == states.shape[2]
-
-     states_all = torch.zeros(
-         [num_states, sequence_dim + 1, batch_dim, hidden_dim],
-         device=x.device,
-         dtype=x.dtype,
-     )
-     states_all[:, 0] = states
-     for i, Wx_t in enumerate(x.unbind(dim=0)):
-         Ry = (
-             states[0]
-             .reshape(batch_dim, num_heads, 1, -1)
-             .matmul(R.unsqueeze(0))
-             .reshape(batch_dim, num_heads, num_gates_r, -1)
-             .transpose(1, 2)
-             .reshape(batch_dim, -1)
-         )
-         sdtype = states.dtype
-         Wx_t, Ry, b, states = Wx_t.float(), Ry.float(), b.float(), states.float()
-         states, gates = slstm_forward_pointwise(Wx_t, Ry, b, states)
-         states = states.to(dtype=sdtype)
-         states_all[:, i + 1] = states
-
-     # shapes ([S, B, H], ([B,H], [B,H], [B,H])
-     return states_all, states
-
-
- def slstm_forward_pointwise(
-     Wx: torch.Tensor,  # dim [B, 4*H]
-     Ry: torch.Tensor,  # dim [B, 4*H]
-     b: torch.Tensor,  # dim [1, 4*H]
-     states: torch.Tensor,  # dim [4, B, H]
- ) -> tuple[torch.Tensor, torch.Tensor]:
-     raw = Wx + Ry + b
-
-     iraw, fraw, zraw, oraw = torch.unbind(raw.view(raw.shape[0], 4, -1), dim=1)
-     y, c, n, m = torch.unbind(states.view(4, states.shape[1], -1), dim=0)
-
-     # with torch.no_grad(): # THE difference to maxg aka max_gradient (here max / max_static)
-     # Equations reference the xlstm paper on page 4: https://arxiv.org/pdf/2405.04517
-     logfplusm = m + F.logsigmoid(fraw)  # eq 15
-     if torch.all(n == 0.0):
-         mnew = iraw
-     else:
-         mnew = torch.max(iraw, logfplusm)  # eq 15
-     ogate = torch.sigmoid(oraw)  # eq 14
-     igate = torch.minimum(torch.exp(iraw - mnew), torch.ones_like(iraw))  # eq 16
-     fgate = torch.minimum(torch.exp(logfplusm - mnew), torch.ones_like(iraw))  # eq 17
-     zgate = torch.tanh(zraw)  # eq 11
-     cnew = fgate * c + igate * zgate  # eq 8
-     nnew = fgate * n + igate  # eq 9
-     hnew = ogate * cnew / nnew  # eq 10
-
-     # y (4, B, H), state (4, B, H)
-     return torch.stack((hnew, cnew, nnew, mnew), dim=0), torch.stack((igate, fgate, zraw, ogate), dim=0)
+ class sLSTMCellTorch:
+     @staticmethod
+     def slstm_forward(
+         x: torch.Tensor,  # [S, B, G*I]
+         states: torch.Tensor,  # [4, B, H] only the first is used for recurrence!
+         R: torch.Tensor,  # [K, R*H, H] - K num_heads
+         b: torch.Tensor,  # [T*H]
+     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         num_gates = 4
+         num_heads = R.shape[0]
+         S, B, _ = x.shape
+         H = R.shape[1] * num_heads
+         assert states.shape == (num_gates, B, H)
+
+         states = states.to(R.dtype).unbind(dim=0)
+         output = []
+         for i in range(S):
+             Ry = (
+                 states[0]
+                 .reshape(B, num_heads, 1, -1)
+                 .matmul(R.unsqueeze(0))
+                 .reshape(B, num_heads, num_gates, -1)
+                 .transpose(1, 2)
+                 .reshape(B, -1)
+             )
+             states = sLSTMCellTorch.slstm_forward_pointwise(
+                 x[i].float(), Ry.float(), b.float(), [s.float() for s in states]
+             )
+             states = [s.to(dtype=R.dtype) for s in states]
+             output.append(states[0])
+
+         return torch.stack(output), torch.stack(states)  # (S, B, H), 4 x (B, H)
+
+     @staticmethod
+     def slstm_forward_pointwise(
+         Wx: torch.Tensor,  # dim [B, 4*H]
+         Ry: torch.Tensor,  # dim [B, 4*H]
+         b: torch.Tensor,  # dim [1, 4*H]
+         states: torch.Tensor,  # dim 4 x [B, H]
+     ) -> list[torch.Tensor]:
+         y, c, n, m = states
+
+         raw = Wx + Ry + b
+         iraw, fraw, zraw, oraw = torch.unbind(raw.view(raw.shape[0], 4, -1), dim=1)
+
+         # Equations reference the xlstm paper on page 4: https://arxiv.org/pdf/2405.04517
+         logfplusm = m + F.logsigmoid(fraw)  # eq 15
+         mnew = torch.where(torch.all(n == 0.0), iraw, torch.max(iraw, logfplusm))  # eq 15
+         ogate = torch.sigmoid(oraw)  # eq 14
+         igate = torch.minimum(torch.exp(iraw - mnew), torch.ones_like(iraw))  # eq 16
+         fgate = torch.minimum(torch.exp(logfplusm - mnew), torch.ones_like(iraw))  # eq 17
+         zgate = torch.tanh(zraw)  # eq 11
+         cnew = fgate * c + igate * zgate  # eq 8
+         nnew = fgate * n + igate  # eq 9
+         hnew = ogate * cnew / nnew  # eq 10
+
+         return [hnew, cnew, nnew, mnew]  # 4 x (B, H)
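
A shape-check sketch for the rewritten pure-torch forward (not part of the diff; sizes are illustrative and the recurrent-kernel layout [num_heads, head_dim, 4 * head_dim] is inferred from the reshapes above):

    import torch
    from tirex.models.slstm.cell import sLSTMCellTorch

    S, B, num_heads, head_dim = 3, 2, 4, 8
    H = num_heads * head_dim
    x = torch.randn(S, B, 4 * H)                         # per-step gate pre-activations Wx
    states = torch.zeros(4, B, H)                        # (h, c, n, m)
    R = torch.randn(num_heads, head_dim, 4 * head_dim)   # recurrent kernel
    b = torch.zeros(4 * H)

    output, new_states = sLSTMCellTorch.slstm_forward(x, states, R, b)
    print(output.shape, new_states.shape)                # torch.Size([3, 2, 32]) torch.Size([4, 2, 32])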