PyPI - minmaxrnc - Versions diffs - 0.1.0__py3-none-any.whl - Mend

minmaxrnc 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

minmaxrnc/__init__.py +12 -0
minmaxrnc/minmax_layer.py +134 -0
minmaxrnc/minmax_neuron.py +148 -0
minmaxrnc/minmax_operator.py +39 -0
minmaxrnc/minmax_rnc.py +281 -0
minmaxrnc/minmax_rnc_lm.py +88 -0
minmaxrnc/minmax_scan.py +60 -0
minmaxrnc/modules/basic_conv.py +77 -0
minmaxrnc/modules/feedforward.py +167 -0
minmaxrnc/modules/gated_conv.py +78 -0
minmaxrnc/modules/initialisers.py +60 -0
minmaxrnc-0.1.0.dist-info/METADATA +329 -0
minmaxrnc-0.1.0.dist-info/RECORD +17 -0
minmaxrnc-0.1.0.dist-info/WHEEL +5 -0
minmaxrnc-0.1.0.dist-info/licenses/LICENSE +143 -0
minmaxrnc-0.1.0.dist-info/licenses/NOTICE +10 -0
minmaxrnc-0.1.0.dist-info/top_level.txt +1 -0

minmaxrnc/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+from .minmax_rnc     import MinMaxRNC, MinMaxRNCConfig
+from .minmax_rnc_lm  import MinMaxRNC_LM, MinMaxRNCLMConfig
+# Public package API: the backbone model, the language-model wrapper, and
+# their two configuration dataclasses (all imported just above).
+__all__ = [
+    "MinMaxRNC",
+    "MinMaxRNCConfig",
+    "MinMaxRNC_LM",
+    "MinMaxRNCLMConfig",
+]

minmaxrnc/minmax_layer.py ADDED Viewed

@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+from typing import Sequence, Optional, List, Tuple, Union, Literal
+from dataclasses import dataclass, replace
+import torch
+import torch.nn as nn
+from .minmax_neuron import MinMaxNeuron, MinMaxNeuronConfig
+from .modules.feedforward import FeedForwardConfig, create_feedforward
+from .modules.basic_conv import BasicConv, BasicConvConfig
+from .modules.gated_conv import GatedConv, GatedConvConfig
+# Accepted spellings for a normalisation choice.  MinMaxLayer maps 'layernorm'
+# to nn.LayerNorm, 'rmsnorm' to nn.RMSNorm and anything else to nn.Identity.
+NormType = Literal['none', 'layernorm', 'rmsnorm']
+@dataclass(frozen=True)
+class MinMaxLayerConfig:
+    """
+    Configuration for one MinMax Layer.
+    This is normally constructed automatically by MinMaxRNCConfig.layer_cfg;
+    direct construction is only needed for non-standard layer shapes.
+    Fields
+    ------
+    neuron : MinMaxNeuronConfig
+        Config for the MinMax Neuron sub-module.
+    conv : BasicConvConfig | GatedConvConfig
+        Config for the short-range convolution applied before the FFN.
+        MinMaxLayer builds a BasicConv only when this object's type is
+        exactly BasicConvConfig; any other value is passed to GatedConv.
+    d_model : int
+        Residual-stream width.  Used as the FFN input/output width and as
+        the size of the norm modules.  It is expected to match
+        neuron.d_model, but this dataclass does not check that.
+    first_in_dropout : float
+        Dropout probability for the FFN in the *first* layer only, where it
+        replaces feedforward.dropout.  Allows a higher input-level dropout
+        without affecting deeper layers.
+    feedforward : FeedForwardConfig | None
+        Config for the feed-forward sub-layer.  Currently required: the
+        None default exists only in the signature, and MinMaxLayer rejects
+        it at construction time.
+    norm : 'none' | 'layernorm' | 'rmsnorm'
+        Pre-norm applied before each of the three sub-layers (conv, FFN,
+        neuron).  Each sub-layer gets its own norm module.
+    """
+    neuron:           MinMaxNeuronConfig
+    conv:             Union[BasicConvConfig, GatedConvConfig]
+    d_model:          int
+    first_in_dropout: float                       = 0.0
+    feedforward:      Optional[FeedForwardConfig] = None
+    norm:             NormType                    = 'layernorm'
+class MinMaxLayer(nn.Module):
+    """
+    One residual layer of the MinMax RNC backbone.
+    Internal data flow (all operations use pre-norm and residual connections):
+        conv_out  = Conv( norm(u) )          # short-range context
+        ffn_out   = FFN( norm(u + conv_out) )
+        neur_out  = Neuron( norm(u + ffn_out) )
+        output    = u + neur_out
+    """
+    def __init__(self, cfg: MinMaxLayerConfig, first: bool):
+        super().__init__()
+        self.cfg = cfg
+        if type(cfg.conv) == BasicConvConfig:
+            self.conv = BasicConv(cfg.conv)
+        else:
+            self.conv = GatedConv(cfg.conv)
+        self.neuron = MinMaxNeuron(cfg.neuron)
+        self.use_ffn = (cfg.feedforward is not None)
+        assert self.use_ffn
+        ffn_dropout = cfg.feedforward.dropout
+        if first:
+            ffn_dropout = cfg.first_in_dropout
+        self.ffn = create_feedforward(
+            config=replace(
+                cfg.feedforward,
+                embedding_dim=cfg.d_model,
+                embedding_dim_out=cfg.d_model,
+                dropout=ffn_dropout
+            )
+        )
+        if self.cfg.norm == 'layernorm':
+            self.norm_ffn  = nn.LayerNorm(cfg.d_model)
+            self.norm_neuron = nn.LayerNorm(cfg.d_model)
+            self.norm_conv = nn.LayerNorm(cfg.d_model)
+        elif self.cfg.norm == 'rmsnorm':
+            self.norm_ffn  = nn.RMSNorm(cfg.d_model)
+            self.norm_neuron = nn.RMSNorm(cfg.d_model)
+            self.norm_conv = nn.RMSNorm(cfg.d_model)
+        else:
+            self.norm_conv = nn.Identity()
+            self.norm_ffn  = nn.Identity()
+            self.norm_neuron = nn.Identity()
+    @property
+    def initial_state(self):
+        return {
+            'neuron': self.neuron.initial_state,
+            'conv':   self.conv.initial_state
+        }
+    def forward(self, u: torch.Tensor, state: dict):
+        conv_in = self.norm_conv(u)
+        conv, conv_state = self.conv(conv_in, state['conv'])
+        ffn_in = self.norm_ffn(u + conv)
+        ffn = self.ffn(ffn_in)
+        neuron_in = self.norm_neuron(u + ffn)
+        neuron, neuron_state = self.neuron(neuron_in, state['neuron'])
+        output = u + neuron
+        state = {'conv': conv_state, 'neuron': neuron_state}
+        return output, state

minmaxrnc/minmax_neuron.py ADDED Viewed

@@ -0,0 +1,148 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+import torch
+import torch.nn as nn
+from dataclasses import dataclass
+from .modules.initialisers import wang_init_, small_init_init_
+from . import minmax_scan
+@dataclass(frozen=True)
+class MinMaxNeuronConfig:
+    """
+    Configuration for a single MinMax Neuron.
+    Fields
+    ------
+    _num_blocks : int
+        Total number of residual blocks in the enclosing model.  Passed to
+        wang_init_ when the output projection is initialised, so that the
+        initial scale of that projection can depend on the model depth.
+    d_model : int
+        Dimension of the input and output (the residual-stream width).
+    d_state : int
+        Dimension of the hidden state x_t.  Larger values give the neuron
+        more memory capacity but increase parameter count linearly.
+    dropout : float
+        Dropout probability applied to the input u before projection.
+    train_init : bool
+        If True, the initial hidden state x_0 is a learned parameter.
+        If False (default), x_0 is fixed at zero.
+    output_gate : bool
+        If True, the state is element-wise multiplied by a linear projection
+        of the input before the final linear: y = W_o(x ⊙ (W_g u)).
+        NOTE(review): MinMaxNeuron.forward applies no sigmoid (or other
+        non-linearity) to W_g u; confirm whether a sigmoid gate was intended.
+    """
+    _num_blocks: int
+    d_model:     int
+    d_state:     int
+    dropout:     float    = 0.0
+    train_init:  bool     = False
+    output_gate: bool     = True
+class MinMaxNeuron(nn.Module):
+    """
+    The core recurrent cell of the MinMax RNC.
+    Maintains a hidden state x_t ∈ R^D updated by the MinMax recurrence:
+        x_{t+1} = max(min(r_t, x_t), s_t)
+    All states for a sequence of length T are computed simultaneously via a parallel prefix scan in
+    O(log T) depth instead of O(T).
+    Output projection:
+        y_t = W_o x_t                             (output_gate=False)
+        y_t = W_o (x_t ⊙ W_g u_t)                 (output_gate=True)
+    """
+    def __init__(self, cfg: MinMaxNeuronConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.I = I = cfg.d_model
+        self.D = D = cfg.d_state
+        self._initial_state = nn.Parameter(torch.zeros(D), requires_grad=cfg.train_init)
+        self.drop = nn.Dropout(cfg.dropout)
+        self.s = nn.Linear(I, D)
+        self.r = nn.Linear(I, D)
+        self.o = nn.Linear(D, I)
+        if cfg.output_gate:
+            self.o_g = nn.Linear(I, D)
+        self.reset()
+    def reset(self):
+        # Init 's'
+        small_init_init_(self.s.weight, dim=self.I)
+        if self.s.bias is not None:
+            nn.init.zeros_(self.s.bias)
+        small_init_init_(self.r.weight, dim=self.I)
+        # Init 'r'
+        if self.r.bias is not None:
+            nn.init.zeros_(self.r.bias)
+        # Init 'o'
+        wang_init_(
+            self.o.weight,
+            dim=self.I,
+            num_blocks= self.cfg._num_blocks,
+        )
+        if self.o.bias is not None:
+            nn.init.zeros_(self.o.bias)
+    @property
+    def initial_state(self):
+        return self._initial_state
+    def forward(self, u: torch.Tensor, state: torch.Tensor):
+        """
+        Compute updated state for a sequence using closed form with initial state.
+        u:     (B, T, I)
+        state: (1/B,D,)   (state before the first step in the input sequence)
+        Returns: 1) sequence of outputs: (B, T, I)
+                 2) last state:          (B, D)
+        """
+        B, T, I = u.shape
+        D = self.D
+        device = u.device
+        if state.dim() == 1: # state is the initial initial state
+            x0 = state.unsqueeze(0).expand(B,D)
+        else:
+            x0 = state
+        # Shape of u:  (B,T,I)
+        # Shape of x0: (B,D)
+        u = self.drop(u)  # (B,T,I)
+        s = self.s(u)     # (B,T,D)
+        r = self.r(u)     # (B,T,D)
+        x_post = minmax_scan.all_states(r, s, x0)
+        x_post = x_post[:,1:,:]
+        # ----- Compute outputs -----
+        x_latest = x_post[:,-1,:]         # (B,T,D)
+        if self.cfg.output_gate:
+            x_post = x_post * self.o_g(u) # (B,T,D)
+        output = self.o(x_post)           # (B,T,I)
+        return output, x_latest

minmaxrnc/minmax_operator.py ADDED Viewed

@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+import torch
+def apply(a: torch.Tensor, b: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+    """
+    Apply the MinMax scalar operator f(x) = max(min(a, x), b)
+    where
+    - min and max are applied element-wise,
+    - shapes (broadcastable):
+      a: (..., D)
+      b: (..., D)
+      x: (..., D)
+    """
+    return torch.maximum(torch.minimum(a, x), b)
+def compose(a2: torch.Tensor, b2: torch.Tensor, a1: torch.Tensor, b1: torch.Tensor):
+    """
+    Compose MinMax scalar operators.
+    Given
+        a1,b1, a2,b2
+    having shape (..., D) and representing the MinMax scalar operators
+        f1(x) = max(min(a1, x), b1),
+        f2(x) = max(min(a2, x), b2),
+    return
+        a = min(a2, a1)
+        b = max(min(a2, b1), b2)
+    corresponding to the MinMax scalar operator
+        f(x) = f2(f1(x)) = max(min(a, x), b)
+    """
+    a = torch.minimum(a2, a1)
+    b = torch.maximum(torch.minimum(a2, b1), b2)
+    return a, b

minmaxrnc/minmax_rnc.py ADDED Viewed

@@ -0,0 +1,281 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+import torch
+import torch.nn as nn
+from typing      import Optional, Literal, Union
+from dataclasses import dataclass, replace
+from .minmax_layer              import MinMaxLayer, MinMaxLayerConfig, NormType
+from .minmax_neuron             import MinMaxNeuronConfig
+from .modules.feedforward       import FeedForwardConfig, FFType, InitType, create_feedforward
+from .modules.basic_conv        import BasicConvConfig
+from .modules.gated_conv        import GatedConvConfig
+# Accepted values for MinMaxRNCConfig.conv_type.  layer_cfg builds a
+# BasicConvConfig for 'basic' and a GatedConvConfig for anything else.
+ConvType = Literal['basic', 'gated']
+@dataclass(frozen=True)
+class MinMaxRNCConfig:
+    """
+    Configuration for the MinMax RNC backbone.
+    Configs for neuron, conv, FFN, and layer are derived (see layer_cfg).
+    Core architecture
+    -----------------
+    d_model : int
+        Residual-stream width.  Every sub-module input and output has this
+        dimension.
+    n_layers : int
+        Number of stacked MinMaxLayers.
+    d_state : int
+        Hidden-state dimension of each MinMax Neuron.  Independent of d_model;
+        larger values increase memory capacity at linear parameter cost.
+    Normalisation
+    -------------
+    norm : 'layernorm' | 'rmsnorm' | 'none'
+        Pre-norm type applied before each sub-layer inside each layer, and
+        before the optional post-layers FFN.
+        'layernorm' (default) is stable; 'rmsnorm' is slightly faster;
+        'none' disables normalisation entirely.
+    postlayers_norm : 'layernorm' | 'rmsnorm' | 'none'
+        Norm applied as the very last step of the backbone, i.e. to the
+        output of the final layer *after* the optional post-layers FFN has
+        been added.  'none' skips it.
+    Feed-forward network (within each layer)
+    -----------------------------------------
+    ffn_type : 'gated' | 'basic'
+        'gated' (default) — gated FFN (ReGLU / SwiGLU depending on
+        act_fn), from Shazeer (2020).  'basic' — standard two-layer MLP.
+    ffn_proj_factor : float
+        Hidden-layer expansion factor relative to d_model.  The hidden
+        dimension is rounded to the nearest multiple of 2.
+    ffn_act_fn : str
+        Activation function name.  Choices: 'relu', 'relu^2', 'gelu',
+        'swish', 'sigmoid', 'selu'.
+    ffn_dropout : float
+        Dropout applied inside the FFN of every layer except the first,
+        which uses prelayers_dropout instead.
+    ffn_init : 'default' | 'scaled'
+        Weight initialisation scheme.  'scaled' uses small_init for the
+        up-projection and wang_init for the down-projection.
+    Neuron
+    ------
+    output_gate : bool
+        If True, the neuron output is element-wise gated by a learned
+        projection of the input.
+    train_init : bool
+        If True, the neuron's initial hidden state x_0 is a learned parameter.
+    neuron_dropout : float
+        Dropout probability applied to the neuron input.
+    Convolution
+    -----------
+    conv_type : 'gated' | 'basic'
+        'gated' (default) — learned scalar gate interpolating between
+        the previous and current token.  'basic' — learned linear mixing of
+        the previous and current token representations.
+    conv_init_val : float
+        Initial value of the gate logit in GatedConv.  0.0 → gate ≈ 0.5
+        (equal mix); negative values bias toward the current token.
+        Ignored when conv_type is 'basic'.
+    Pre/Post-layers
+    ---------------
+    prelayers_dropout : float
+        FFN dropout for the first layer only; overrides ffn_dropout.  Useful
+        as an input-level regulariser without penalising deeper layers.
+    use_postlayers_ffn : bool
+        If True, an extra residual FFN (built from the same FeedForwardConfig
+        as the in-layer FFN) is applied after all layers, before
+        postlayers_norm.  Its input is pre-normed with the `norm` type.
+    Forward-time argument (not a field of this config)
+    --------------------------------------------------
+    unroll_steps : int
+        Required argument of MinMaxRNC.forward: the sequence chunk size.
+        The sequence is split into chunks of this length and processed
+        sequentially (carrying the state across chunks).  unroll_steps=1
+        processes one token at a time; unroll_steps=T processes the whole
+        sequence at once.  Both are intended to give identical outputs;
+        larger values use more peak memory.
+    """
+    # Core architecture
+    d_model:  int
+    n_layers: int
+    d_state:  int
+    # Normalisation (within layers and post-layers)
+    norm:            NormType = 'layernorm'
+    postlayers_norm: NormType = 'layernorm'
+    # FFN within each layer
+    ffn_type:        FFType   = 'gated'
+    ffn_proj_factor: float    = 1.3
+    ffn_act_fn:      str      = 'relu'
+    ffn_dropout:     float    = 0.0
+    ffn_init:        InitType = 'scaled'
+    # Neuron
+    output_gate:     bool  = True
+    train_init:      bool  = False
+    neuron_dropout:  float = 0.0
+    # Conv
+    conv_type:       ConvType = 'gated'
+    conv_init_val:   float    = 0.0
+    # First-layer option (FFN dropout override for layer 0)
+    prelayers_dropout:  float = 0.0
+    # Post-layers
+    use_postlayers_ffn: bool = False
+    @property
+    def layer_cfg(self) -> MinMaxLayerConfig:
+        """
+        Build the MinMaxLayerConfig shared by all layers of the stack.
+        A fresh config object is assembled on every access.  Both the neuron
+        and the FFN config receive n_layers as their _num_blocks.
+        """
+        neuron_cfg = MinMaxNeuronConfig(
+            _num_blocks = self.n_layers,
+            d_model     = self.d_model,
+            d_state     = self.d_state,
+            dropout     = self.neuron_dropout,
+            train_init  = self.train_init,
+            output_gate = self.output_gate,
+        )
+        # Any conv_type other than 'basic' falls through to the gated conv.
+        if self.conv_type == 'basic':
+            conv_cfg = BasicConvConfig(embedding_dim=self.d_model)
+        else:
+            conv_cfg = GatedConvConfig(
+                embedding_dim = self.d_model,
+                init_val      = self.conv_init_val,
+            )
+        ffn_cfg = FeedForwardConfig(
+            _num_blocks  = self.n_layers,
+            ffn_type     = self.ffn_type,
+            proj_factor  = self.ffn_proj_factor,
+            act_fn       = self.ffn_act_fn,
+            dropout      = self.ffn_dropout,
+            init         = self.ffn_init,
+        )
+        return MinMaxLayerConfig(
+            d_model          = self.d_model,
+            neuron           = neuron_cfg,
+            conv             = conv_cfg,
+            feedforward      = ffn_cfg,
+            norm             = self.norm,
+            first_in_dropout = self.prelayers_dropout,
+        )
+    # ------------------------------------------------------------------
+    # Preset factories
+    # ------------------------------------------------------------------
+    @classmethod
+    def small(cls, n_layers: int = 2) -> 'MinMaxRNCConfig':
+        """Preset with d_model=90 and d_state=40; all other fields keep their defaults."""
+        return cls(d_model=90, n_layers=n_layers, d_state=40)
+    @classmethod
+    def medium(cls, n_layers: int = 8) -> 'MinMaxRNCConfig':
+        """Preset with d_model=512 and d_state=512; all other fields keep their defaults."""
+        return cls(d_model=512, n_layers=n_layers, d_state=512)
+    @classmethod
+    def large(cls, n_layers: int = 12) -> 'MinMaxRNCConfig':
+        """Preset with d_model=728 and d_state=1456; all other fields keep their defaults."""
+        return cls(d_model=728, n_layers=n_layers, d_state=1456)
+class MinMaxRNC(nn.Module):
+    """
+    MinMax Recurrent Neural Cascade — the backbone sequence model.
+    Stacks ``cfg.n_layers`` MinMaxLayers, each containing a short-range
+    convolution, a feed-forward network, and a MinMax Neuron.  All three
+    sub-layers use pre-norm and residual connections.
+    Inputs
+    ------
+    u : Tensor  (B, T, d_model)
+        Continuous input sequence (e.g. token embeddings).
+    state : list[dict] | None
+        Per-layer recurrent state from a previous call.  Pass None (or omit)
+        to start from the default initial state.
+    return_state : bool
+        If True, also return the updated state after the last token.
+    Outputs
+    -------
+    y : Tensor  (B, T, d_model)
+    state : list[dict]  — only when return_state=True
+    """
+    def __init__(self, cfg: MinMaxRNCConfig):
+        super().__init__()
+        self.__cfg = cfg
+        self.reset()
+    def reset(self):
+        layer_cfg = self.__cfg.layer_cfg
+        self.layers = nn.ModuleList()
+        firstlayer = True
+        for _ in range(self.__cfg.n_layers):
+            self.layers.append(MinMaxLayer(layer_cfg, first=firstlayer))
+            firstlayer = False
+        self.postlayers_norm = None
+        if self.__cfg.postlayers_norm == 'layernorm':
+            self.postlayers_norm = nn.LayerNorm(self.__cfg.d_model)
+        elif self.__cfg.postlayers_norm == 'rmsnorm':
+            self.postlayers_norm = nn.RMSNorm(self.__cfg.d_model)
+        self.postlayers_ffn      = None
+        self.postlayers_ffn_norm = None
+        if self.__cfg.use_postlayers_ffn:
+            self.postlayers_ffn = create_feedforward(
+                config=replace(
+                    layer_cfg.feedforward,
+                    embedding_dim     = self.__cfg.d_model,
+                    embedding_dim_out = self.__cfg.d_model,
+                )
+            )
+            if self.__cfg.norm == 'layernorm':
+                self.postlayers_ffn_norm = nn.LayerNorm(self.__cfg.d_model)
+            elif self.__cfg.norm == 'rmsnorm':
+                self.postlayers_ffn_norm = nn.RMSNorm(self.__cfg.d_model)
+            else:
+                self.postlayers_ffn_norm = nn.Identity()
+    @property
+    def initial_state(self):
+        return [layer.initial_state for layer in self.layers]
+    def _parallel_forward(self, u: torch.Tensor, state):
+        """u: [B, T, D] — returns output [B, T, D] and updated state."""
+        updated_state = []
+        y = u
+        for layer, layer_state in zip(self.layers, state):
+            y, updated_layer_state = layer(y, layer_state)
+            updated_state.append(updated_layer_state)
+        if self.postlayers_ffn is not None:
+            y = y + self.postlayers_ffn(self.postlayers_ffn_norm(y))
+        if self.postlayers_norm is not None:
+            y = self.postlayers_norm(y)
+        return y, updated_state
+    def forward(self, u: torch.Tensor,  unroll_steps: int, state=None, return_state: bool = False):
+        if state is None:
+            state = self.initial_state
+        y_chunks = []
+        for u_chunk in u.split(unroll_steps, dim=1):
+            y_chunk, state = self._parallel_forward(u_chunk, state)
+            y_chunks.append(y_chunk)
+        y = torch.cat(y_chunks, dim=1)
+        if return_state:
+            return y, state
+        return y

minmaxrnc/minmax_rnc_lm.py ADDED Viewed

@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: 2026 Alessandro Ronca
+# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
+import torch
+import torch.nn as nn
+from dataclasses import dataclass
+from .minmax_rnc             import MinMaxRNC, MinMaxRNCConfig
+from .modules.initialisers   import small_init_init_
+@dataclass(frozen=True)
+class MinMaxRNCLMConfig:
+    """
+    Configuration for the MinMax RNC language model.
+    The vocabulary size is not part of this config; it is passed separately
+    to the MinMaxRNC_LM constructor.
+    Fields
+    ------
+    backbone : MinMaxRNCConfig
+        Config for the MinMaxRNC backbone. The embedding dimension is taken
+        from backbone.d_model.
+    head_dropout : float
+        Dropout applied to the backbone output before the LM head projection.
+    tie_weights : bool
+        If True (default), the LM head weight matrix is shared with the token
+        embedding matrix, halving those parameters and acting as a regulariser.
+        If False, the head gets its own weight matrix with its own
+        initialisation.
+    """
+    backbone:     MinMaxRNCConfig
+    head_dropout: float = 0.0
+    tie_weights:  bool  = True
+class MinMaxRNC_LM(MinMaxRNC):
+    """
+    MinMax RNC with a token embedding layer and a language-model head.
+    Wraps MinMaxRNC with:
+    - A token embedding  (vocab_size × d_model)
+    - A dropout before the output projection
+    - A linear LM head   (d_model × vocab_size), optionally tied to the embedding
+    Inputs
+    ------
+    tokens : LongTensor  (B, T)
+        Token indices in [0, vocab_size).
+    state : list[dict] | None
+        Recurrent state from a previous call.
+    return_state : bool
+        If True, also return the updated state.
+    Outputs
+    -------
+    logits : Tensor  (B, T, vocab_size)
+    state  : list[dict]  — only when return_state=True
+    """
+    def __init__(self, vocab_size: int, cfg: MinMaxRNCLMConfig):
+        self.__lm_cfg   = cfg
+        self.__vocab_size = vocab_size
+        super().__init__(cfg.backbone)   # calls reset() → MinMaxRNC.reset() then our additions
+        self.__lm_reset()
+    def reset(self):
+        super().reset()
+        self.__lm_reset()
+    def __lm_reset(self):
+        d_model = self.__lm_cfg.backbone.d_model
+        self.token_emb = nn.Embedding(self.__vocab_size, d_model)
+        self.lm_head   = nn.Linear(d_model, self.__vocab_size, bias=False)
+        self.head_drop = nn.Dropout(self.__lm_cfg.head_dropout)
+        small_init_init_(self.token_emb.weight, dim=d_model)
+        if self.__lm_cfg.tie_weights:
+            self.lm_head.weight = self.token_emb.weight
+        else:
+            small_init_init_(self.lm_head.weight, dim=d_model)
+    def forward(self, tokens: torch.Tensor, unroll_steps: int, state=None, return_state: bool = False):
+        y, state = super().forward(
+            self.token_emb(tokens), unroll_steps, state=state, return_state=True
+        )
+        logits = self.lm_head(self.head_drop(y))
+        if return_state:
+            return logits, state
+        return logits