PyPI - scratchkit - Versions diffs - 0.2.0__py3-none-any.whl - Mend

scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

mlscratch/__init__.py +56 -0
mlscratch/__main__.py +118 -0
mlscratch/bayesian/__init__.py +53 -0
mlscratch/bayesian/bayesian_linear_regression.py +171 -0
mlscratch/bayesian/bayesian_network.py +248 -0
mlscratch/bayesian/bayesian_nn.py +315 -0
mlscratch/bayesian/gaussian_process.py +207 -0
mlscratch/bayesian/hmm.py +277 -0
mlscratch/bayesian/init.py +52 -0
mlscratch/bayesian/kalman_filter.py +182 -0
mlscratch/bayesian/naive_bayes.py +209 -0
mlscratch/metrics/__init__.py +59 -0
mlscratch/metrics/classification.py +365 -0
mlscratch/metrics/regression.py +79 -0
mlscratch/neural/__init__.py +121 -0
mlscratch/neural/attention.py +420 -0
mlscratch/neural/autoencoder.py +543 -0
mlscratch/neural/boltzmann.py +231 -0
mlscratch/neural/cnn.py +593 -0
mlscratch/neural/cvnn.py +322 -0
mlscratch/neural/gan.py +364 -0
mlscratch/neural/hopfield.py +193 -0
mlscratch/neural/perceptron.py +398 -0
mlscratch/neural/rbf_network.py +230 -0
mlscratch/neural/recurrent.py +569 -0
mlscratch/preprocessing/__init__.py +38 -0
mlscratch/preprocessing/encoders.py +140 -0
mlscratch/preprocessing/model_selection.py +119 -0
mlscratch/preprocessing/polynomial.py +105 -0
mlscratch/preprocessing/scalers.py +220 -0
mlscratch/py.typed +0 -0
mlscratch/reinforcement/__init__.py +59 -0
mlscratch/reinforcement/ddpg.py +363 -0
mlscratch/reinforcement/dqn.py +319 -0
mlscratch/reinforcement/ppo.py +452 -0
mlscratch/reinforcement/q_learning.py +352 -0
mlscratch/reinforcement/sac.py +382 -0
mlscratch/reinforcement/utils.py +594 -0
mlscratch/supervised/__init__.py +76 -0
mlscratch/supervised/_validation.py +50 -0
mlscratch/supervised/adaboost.py +255 -0
mlscratch/supervised/decision_tree.py +495 -0
mlscratch/supervised/gradient_boosting.py +354 -0
mlscratch/supervised/knn.py +234 -0
mlscratch/supervised/lasso_regression.py +125 -0
mlscratch/supervised/linear_models.py +459 -0
mlscratch/supervised/linear_regression.py +197 -0
mlscratch/supervised/logistic_regression.py +119 -0
mlscratch/supervised/naive_bayes.py +113 -0
mlscratch/supervised/random_forest.py +321 -0
mlscratch/supervised/ridge_regression.py +93 -0
mlscratch/supervised/svm.py +356 -0
mlscratch/unsupervised/__init__.py +39 -0
mlscratch/unsupervised/apriori.py +178 -0
mlscratch/unsupervised/dbscan.py +141 -0
mlscratch/unsupervised/gmm.py +204 -0
mlscratch/unsupervised/hierarchical_clustering.py +137 -0
mlscratch/unsupervised/ica.py +167 -0
mlscratch/unsupervised/kmeans.py +135 -0
mlscratch/unsupervised/kmedoids.py +133 -0
mlscratch/unsupervised/pca.py +103 -0
mlscratch/unsupervised/tsne.py +200 -0
scratchkit-0.2.0.dist-info/METADATA +241 -0
scratchkit-0.2.0.dist-info/RECORD +68 -0
scratchkit-0.2.0.dist-info/WHEEL +5 -0
scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
scratchkit-0.2.0.dist-info/top_level.txt +1 -0

mlscratch/metrics/regression.py ADDED Viewed

@@ -0,0 +1,79 @@
+r"""
+Regression Metrics
+===================
+Evaluation metrics for regressors, implemented from scratch in pure numpy.
+.. math::
+    \mathrm{MSE} = \frac1n\sum_i (y_i-\hat y_i)^2, \qquad
+    \mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad
+    \mathrm{MAE} = \frac1n\sum_i |y_i-\hat y_i|
+.. math::
+    \mathrm{MAPE} = \frac1n\sum_i \left|\frac{y_i-\hat y_i}{y_i}\right|, \qquad
+    R^2 = 1 - \frac{\sum_i(y_i-\hat y_i)^2}{\sum_i(y_i-\bar y)^2}
+.. math::
+    \mathrm{ExplainedVariance} = 1 - \frac{\mathrm{Var}(y-\hat y)}{\mathrm{Var}(y)}
+"""
+from __future__ import annotations
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+# Absolute tolerance shared by the metrics in this module: |y_true| values,
+# sums of squares and variances below it are treated as zero so that the
+# divisions in MAPE, R^2 and explained variance never divide by (near) zero.
+_EPS = 1e-12
+def _validate(
+    y_true: ArrayLike, y_pred: ArrayLike
+) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
+    """Coerce both inputs to flat float64 vectors and check they line up.
+    Parameters
+    ----------
+    y_true, y_pred : array-like
+        Ground-truth and predicted values.  Inputs of any shape are accepted
+        and flattened to 1-D copies.
+    Returns
+    -------
+    tuple of two 1-D float64 arrays with the same, non-zero length.
+    Raises
+    ------
+    ValueError
+        If the flattened lengths differ, or if the inputs are empty.
+    """
+    # y_true is converted before y_pred, so a conversion error on y_true wins.
+    truth, pred = (
+        np.asarray(values, dtype=np.float64).flatten() for values in (y_true, y_pred)
+    )
+    n_true, n_pred = truth.shape[0], pred.shape[0]
+    if n_true != n_pred:
+        raise ValueError(f"y_true has {n_true} samples but y_pred has {n_pred}.")
+    if n_true == 0:
+        raise ValueError("y_true and y_pred must not be empty.")
+    return truth, pred
+def mean_squared_error(y_true: ArrayLike, y_pred: ArrayLike, squared: bool = True) -> float:
+    """Average of the squared residuals ``(y_true - y_pred) ** 2``.
+    Parameters
+    ----------
+    y_true, y_pred : array-like
+        Targets and predictions; validated and flattened by ``_validate``.
+    squared : bool, default True
+        When False the square root of the MSE (i.e. the RMSE) is returned.
+    Returns
+    -------
+    float
+    """
+    truth, pred = _validate(y_true, y_pred)
+    residuals = truth - pred
+    mse = float(np.mean(residuals ** 2))
+    if squared:
+        return mse
+    return float(np.sqrt(mse))
+def root_mean_squared_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
+    """Root mean squared error: ``mean_squared_error`` with ``squared=False``."""
+    rmse = mean_squared_error(y_true, y_pred, squared=False)
+    return rmse
+def mean_absolute_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
+    """Average of the absolute residuals ``|y_true - y_pred|``.
+    Inputs are validated and flattened by ``_validate``; the result is a
+    plain Python float.
+    """
+    truth, pred = _validate(y_true, y_pred)
+    abs_errors = np.abs(truth - pred)
+    return float(np.mean(abs_errors))
+def mean_absolute_percentage_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
+    """Mean of ``|y_true - y_pred| / |y_true|``, returned as a fraction.
+    The result is not multiplied by 100.  Wherever ``|y_true|`` is below the
+    module tolerance ``_EPS`` the denominator is replaced by ``_EPS`` so the
+    division never hits zero; such entries can therefore contribute very
+    large terms to the mean.
+    """
+    truth, pred = _validate(y_true, y_pred)
+    magnitude = np.abs(truth)
+    # Floor tiny magnitudes at _EPS instead of dividing by (near) zero.
+    denom = np.where(magnitude < _EPS, _EPS, magnitude)
+    relative_errors = np.abs((truth - pred) / denom)
+    return float(np.mean(relative_errors))
+def r2_score(y_true: ArrayLike, y_pred: ArrayLike) -> float:
+    """Coefficient of determination, ``1 - SS_res / SS_tot``.
+    ``SS_res`` is the sum of squared residuals and ``SS_tot`` the sum of
+    squared deviations of ``y_true`` from its mean.  When ``SS_tot`` does not
+    exceed the module tolerance ``_EPS`` (a constant target) the function
+    returns 0.0 instead of NaN/inf, regardless of how good the predictions are.
+    """
+    truth, pred = _validate(y_true, y_pred)
+    centred = truth - truth.mean()
+    ss_tot = float(np.sum(centred ** 2))
+    ss_res = float(np.sum((truth - pred) ** 2))
+    if ss_tot > _EPS:
+        return 1.0 - ss_res / ss_tot
+    # Degenerate target with (numerically) zero variance.
+    return 0.0
+def explained_variance_score(y_true: ArrayLike, y_pred: ArrayLike) -> float:
+    """Explained variance, ``1 - Var(y_true - y_pred) / Var(y_true)``.
+    Both variances are population variances (``np.var`` defaults).  When the
+    variance of ``y_true`` does not exceed the module tolerance ``_EPS`` the
+    function returns 0.0 rather than dividing by (near) zero.
+    """
+    truth, pred = _validate(y_true, y_pred)
+    var_true = float(np.var(truth))
+    var_residual = float(np.var(truth - pred))
+    if var_true > _EPS:
+        return 1.0 - var_residual / var_true
+    return 0.0

mlscratch/neural/__init__.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""
+mlscratch.neural
+=================
+From-scratch implementations of neural network architectures.
+Pure NumPy — no PyTorch, no TensorFlow.
+Perceptrons
+-----------
+SingleLayerPerceptron   – binary classification or regression
+MultiLayerPerceptron    – feedforward network, classification or regression
+Autoencoders
+------------
+Autoencoder              – tied-weight vanilla autoencoder
+DenoisingAutoencoder     – trained on corrupted inputs (Gaussian / dropout noise)
+VariationalAutoencoder   – Gaussian latent space, reparameterisation trick
+Recurrent Networks
+-------------------
+SimpleRNN        – Elman RNN, classification/regression/feature-extractor
+LSTMCell         – single-timestep LSTM cell
+LSTM             – multi-layer LSTM, optional linear output head
+EncoderDecoder   – seq2seq RNN encoder-decoder
+Convolutional Networks
+------------------------
+Conv2D, MaxPool2D, AvgPool2D, BatchNorm2D, Flatten, Dense  – CNN building blocks
+SimpleCNN        – pre-wired conv → pool → conv → pool → dense → softmax
+Attention / Transformer
+--------------------------
+ScaledDotProductAttention
+MultiHeadAttention
+PositionalEncoding
+LayerNorm
+FeedForward
+TransformerEncoderLayer
+TransformerEncoder
+Generative Models
+-------------------
+Generator, Discriminator, GAN  – adversarial generative network
+Associative Memory
+--------------------
+HopfieldNetwork   – discrete bipolar associative memory
+Energy-Based Models
+----------------------
+RestrictedBoltzmannMachine  – RBM trained with Contrastive Divergence
+Radial Basis Function Networks
+---------------------------------
+RBFNetwork   – Gaussian RBF hidden layer + closed-form linear output
+Complex-Valued Networks
+---------------------------
+ComplexDense       – complex-valued fully-connected layer
+ComplexValuedNN    – multi-layer complex-valued feedforward network
+Note
+----
+Bayesian Neural Networks live in ``mlscratch.bayesian.bayesian_nn``
+(``BayesianNeuralNetwork``) since they are fundamentally a Bayesian
+inference method applied to a network architecture.
+"""
+from .perceptron import SingleLayerPerceptron, MultiLayerPerceptron      # noqa: F401
+from .autoencoder import (                                                # noqa: F401
+    Autoencoder,
+    DenoisingAutoencoder,
+    VariationalAutoencoder,
+)
+from .recurrent import SimpleRNN, LSTMCell, LSTM, EncoderDecoder          # noqa: F401
+from .cnn import (                                                         # noqa: F401
+    Conv2D,
+    MaxPool2D,
+    AvgPool2D,
+    BatchNorm2D,
+    Flatten,
+    Dense,
+    SimpleCNN,
+)
+from .attention import (                                                   # noqa: F401
+    ScaledDotProductAttention,
+    MultiHeadAttention,
+    PositionalEncoding,
+    LayerNorm,
+    FeedForward,
+    TransformerEncoderLayer,
+    TransformerEncoder,
+)
+from .gan import Generator, Discriminator, GAN                            # noqa: F401
+from .hopfield import HopfieldNetwork                                     # noqa: F401
+from .boltzmann import RestrictedBoltzmannMachine                         # noqa: F401
+from .rbf_network import RBFNetwork                                       # noqa: F401
+from .cvnn import ComplexDense, ComplexValuedNN                           # noqa: F401
+# Public API of ``mlscratch.neural``: exactly the names re-exported by the
+# relative imports above, grouped by architecture family.
+__all__ = [
+    # Perceptrons
+    "SingleLayerPerceptron", "MultiLayerPerceptron",
+    # Autoencoders
+    "Autoencoder", "DenoisingAutoencoder", "VariationalAutoencoder",
+    # Recurrent
+    "SimpleRNN", "LSTMCell", "LSTM", "EncoderDecoder",
+    # CNN
+    "Conv2D", "MaxPool2D", "AvgPool2D", "BatchNorm2D", "Flatten", "Dense", "SimpleCNN",
+    # Attention / Transformer
+    "ScaledDotProductAttention", "MultiHeadAttention", "PositionalEncoding",
+    "LayerNorm", "FeedForward", "TransformerEncoderLayer", "TransformerEncoder",
+    # GAN
+    "Generator", "Discriminator", "GAN",
+    # Associative memory
+    "HopfieldNetwork",
+    # Energy-based
+    "RestrictedBoltzmannMachine",
+    # RBF
+    "RBFNetwork",
+    # Complex-valued
+    "ComplexDense", "ComplexValuedNN",
+]

mlscratch/neural/attention.py ADDED Viewed

@@ -0,0 +1,420 @@
+"""
+Attention Mechanisms and Transformer
+======================================
+Building blocks of the Transformer architecture (Vaswani et al., 2017).
+ScaledDotProductAttention
+--------------------------
+The core attention operation:
+    Attention(Q, K, V) = softmax(QK^T / √d_k) V
+MultiHeadAttention
+-------------------
+Splits Q, K, V into ``n_heads`` parallel attention computations,
+concatenates results, and projects back to ``d_model``:
+    head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
+    MHA(Q,K,V) = Concat(head_1, ..., head_h) W^O
+PositionalEncoding
+-------------------
+Injects order information using sinusoids of varying frequency:
+    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
+    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
+LayerNorm
+---------
+Normalises across the feature dimension:
+    LN(x) = γ (x - μ) / √(σ² + ε) + β
+FeedForward
+-----------
+Two-layer MLP with ReLU, applied position-wise:
+    FFN(x) = ReLU(x W1 + b1) W2 + b2
+TransformerEncoderLayer / TransformerEncoder
+----------------------------------------------
+Standard encoder block: MHA → Add&Norm → FFN → Add&Norm, stacked
+``n_layers`` times.
+References
+----------
+Vaswani et al. (2017). Attention is all you need. NeurIPS.
+Only numpy is used.
+"""
+from __future__ import annotations
+import numpy as np
+# ============================================================
+# Helpers
+# ============================================================
+def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    """Softmax along ``axis``, computed in a numerically stable way.
+    The maximum of each slice is subtracted before exponentiating, so the
+    largest exponent is ``exp(0) = 1`` and large inputs do not overflow.
+    The output has the same shape as ``x`` and sums to 1 along ``axis``.
+    """
+    shifted = x - x.max(axis=axis, keepdims=True)
+    exps = np.exp(shifted)
+    normaliser = exps.sum(axis=axis, keepdims=True)
+    return exps / normaliser
+def _relu(x: np.ndarray) -> np.ndarray:
+    """Rectified linear unit: element-wise ``max(0, x)``."""
+    floor = 0.0
+    activated = np.maximum(floor, x)
+    return activated
+# ============================================================
+# Scaled Dot-Product Attention
+# ============================================================
+class ScaledDotProductAttention:
+    """
+    Stateless scaled dot-product attention (holds no learnable parameters).
+    Computes ``softmax(Q K^T / sqrt(d_k)) V``.  When a mask is supplied, the
+    scores at masked positions are overwritten with a large negative constant
+    before the softmax is taken.
+    """
+    def __call__(
+        self,
+        Q: np.ndarray,
+        K: np.ndarray,
+        V: np.ndarray,
+        mask: np.ndarray | None = None,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Attend from the queries ``Q`` over the key/value pairs ``K``, ``V``.
+        Parameters
+        ----------
+        Q : (..., seq_len_q, d_k)
+        K : (..., seq_len_k, d_k)
+        V : (..., seq_len_k, d_v)
+        mask : (..., seq_len_q, seq_len_k) or None
+            Entries equal to 0 mark positions that must not be attended to;
+            their scores are replaced by ``-1e9`` (a finite stand-in for
+            -inf) before the softmax.  Used for causal / padding masks.
+        Returns
+        -------
+        output : (..., seq_len_q, d_v)
+        attn_weights : (..., seq_len_q, seq_len_k)
+        """
+        key_dim = Q.shape[-1]
+        keys_t = np.swapaxes(K, -1, -2)
+        scores = Q @ keys_t / np.sqrt(key_dim)
+        if mask is not None:
+            # A mask with more axes than the scores has its surplus leading
+            # axes dropped one at a time (each must have size 1) so that it
+            # broadcasts against the scores.
+            blocked = mask
+            for _ in range(blocked.ndim - scores.ndim):
+                blocked = blocked.squeeze(0)
+            scores = np.where(blocked == 0, -1e9, scores)
+        attn_weights = _softmax(scores, axis=-1)
+        return attn_weights @ V, attn_weights
+# ============================================================
+# Multi-Head Attention
+# ============================================================
+class MultiHeadAttention:
+    """
+    Multi-head self-attention with learnable projection matrices.
+    The input is projected to queries, keys and values, split into
+    ``n_heads`` slices of width ``d_model // n_heads``, attended per head,
+    re-joined, and passed through an output projection.
+    Parameters
+    ----------
+    d_model : int
+        Input/output feature dimension.
+    n_heads : int
+        Number of attention heads.  Must evenly divide d_model.
+    random_state : int or None
+        Seed for the generator that initialises the four weight matrices.
+    Raises
+    ------
+    ValueError
+        If ``d_model`` is not a multiple of ``n_heads``.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        random_state: int | None = None,
+    ) -> None:
+        if d_model % n_heads:
+            raise ValueError("d_model must be divisible by n_heads.")
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.d_k = d_model // n_heads
+        rng = np.random.default_rng(random_state)
+        std = np.sqrt(2.0 / d_model)
+        square = (d_model, d_model)
+        # Draw order (W_q, W_k, W_v, W_o) determines the seeded weights.
+        self.W_q = rng.normal(0, std, square)
+        self.W_k = rng.normal(0, std, square)
+        self.W_v = rng.normal(0, std, square)
+        self.W_o = rng.normal(0, std, square)
+        self._attn = ScaledDotProductAttention()
+        # Attention weights from the most recent forward() call, for inspection.
+        self.last_attn_weights_: np.ndarray | None = None
+    def _split_heads(self, x: np.ndarray) -> np.ndarray:
+        """Reshape (B, T, d_model) into per-head form (B, n_heads, T, d_k)."""
+        batch, steps, _ = x.shape
+        per_head = x.reshape(batch, steps, self.n_heads, self.d_k)
+        return per_head.transpose(0, 2, 1, 3)
+    def _combine_heads(self, x: np.ndarray) -> np.ndarray:
+        """Inverse of ``_split_heads``: (B, n_heads, T, d_k) to (B, T, d_model)."""
+        batch, heads, steps, width = x.shape
+        time_major = x.transpose(0, 2, 1, 3)
+        return time_major.reshape(batch, steps, heads * width)
+    def forward(
+        self,
+        x: np.ndarray,
+        mask: np.ndarray | None = None,
+    ) -> np.ndarray:
+        """
+        Run self-attention, i.e. queries, keys and values all come from ``x``.
+        Parameters
+        ----------
+        x : (B, T, d_model)
+        mask : (B, 1, T, T) or None
+            Passed through to the scaled dot-product attention.
+        Returns
+        -------
+        out : (B, T, d_model)
+        """
+        queries, keys, values = (
+            self._split_heads(x @ weight)
+            for weight in (self.W_q, self.W_k, self.W_v)
+        )
+        attended, weights = self._attn(queries, keys, values, mask)
+        self.last_attn_weights_ = weights
+        return self._combine_heads(attended) @ self.W_o
+# ============================================================
+# Positional Encoding
+# ============================================================
+class PositionalEncoding:
+    """
+    Sinusoidal positional encoding (no learnable parameters).
+    The table ``pe`` of shape (max_len, d_model) is built once at
+    construction: even columns hold sines and odd columns hold cosines of
+    ``pos / 10000^(2i / d_model)``.
+    Parameters
+    ----------
+    d_model : int
+        Feature dimension; both even and odd values are supported.
+    max_len : int
+        Maximum supported sequence length.
+    """
+    def __init__(self, d_model: int, max_len: int = 512) -> None:
+        self.d_model = d_model
+        self.max_len = max_len
+        self.pe = self._build(d_model, max_len)
+    @staticmethod
+    def _build(d_model: int, max_len: int) -> np.ndarray:
+        """Return the (max_len, d_model) table of sinusoidal encodings."""
+        position = np.arange(max_len)[:, np.newaxis]                    # (max_len, 1)
+        div_term = np.exp(
+            np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model)
+        )                                                                # (ceil(d_model/2),)
+        angles = position * div_term                                     # (max_len, ceil(d_model/2))
+        pe = np.zeros((max_len, d_model))
+        pe[:, 0::2] = np.sin(angles)
+        # An odd d_model has one fewer odd (cosine) column than even (sine)
+        # column, so trim the angles to the number of cosine slots.  For an
+        # even d_model the slice is a no-op.
+        pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
+        return pe
+    def forward(self, x: np.ndarray) -> np.ndarray:
+        """
+        Add positional encoding to x.
+        Parameters
+        ----------
+        x : (B, T, d_model)  or  (T, d_model)
+        Returns
+        -------
+        same shape as x
+        Raises
+        ------
+        ValueError
+            If the sequence length ``T`` exceeds ``max_len``.
+        """
+        T = x.shape[-2]
+        if T > self.max_len:
+            raise ValueError(f"Sequence length {T} exceeds max_len={self.max_len}.")
+        # The first T rows broadcast over any leading batch axis.
+        return x + self.pe[:T]
+# ============================================================
+# LayerNorm
+# ============================================================
+class LayerNorm:
+    """
+    Layer normalisation over the last axis, followed by a per-feature
+    scale (``gamma``, initialised to ones) and shift (``beta``, zeros).
+    Parameters
+    ----------
+    d_model : int
+        Size of the normalised (last) axis.
+    eps : float
+        Added to the variance before the square root to avoid dividing by zero.
+    """
+    def __init__(self, d_model: int, eps: float = 1e-6) -> None:
+        self.eps = eps
+        self.gamma = np.ones(d_model)
+        self.beta = np.zeros(d_model)
+    def forward(self, x: np.ndarray) -> np.ndarray:
+        """Normalise ``x`` of shape (..., d_model); the shape is preserved."""
+        mu = x.mean(axis=-1, keepdims=True)
+        sigma2 = x.var(axis=-1, keepdims=True)
+        normalised = (x - mu) / np.sqrt(sigma2 + self.eps)
+        return self.gamma * normalised + self.beta
+# ============================================================
+# Feed-Forward Network
+# ============================================================
+class FeedForward:
+    """
+    Position-wise two-layer MLP: Linear, ReLU, Linear.
+    Weights are drawn from zero-mean normals with standard deviation
+    ``sqrt(2 / fan_in)``; both biases start at zero.
+    Parameters
+    ----------
+    d_model : int
+        Input and output feature dimension.
+    d_ff : int
+        Hidden layer size (typically 4 × d_model).
+    random_state : int or None
+        Seed for the weight-initialisation generator.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        d_ff: int = 256,
+        random_state: int | None = None,
+    ) -> None:
+        rng = np.random.default_rng(random_state)
+        std_in = np.sqrt(2.0 / d_model)
+        std_hidden = np.sqrt(2.0 / d_ff)
+        # W1 is drawn before W2; the order fixes the seeded values.
+        self.W1 = rng.normal(0, std_in, (d_model, d_ff))
+        self.W2 = rng.normal(0, std_hidden, (d_ff, d_model))
+        self.b1 = np.zeros(d_ff)
+        self.b2 = np.zeros(d_model)
+    def forward(self, x: np.ndarray) -> np.ndarray:
+        """Map x of shape (..., d_model) to an output of the same shape."""
+        hidden = _relu(x @ self.W1 + self.b1)
+        return hidden @ self.W2 + self.b2
+# ============================================================
+# Transformer Encoder Layer
+# ============================================================
+class TransformerEncoderLayer:
+    """
+    One post-norm Transformer encoder block:
+        x  = LayerNorm(x + MultiHeadAttention(x))
+        x  = LayerNorm(x + FeedForward(x))
+    Parameters
+    ----------
+    d_model : int
+        Feature dimension of the inputs and outputs.
+    n_heads : int
+        Number of attention heads.
+    d_ff : int
+        Hidden size of the feed-forward sub-layer.
+    random_state : int or None
+        Seed handed unchanged to both the attention and the feed-forward
+        sub-layer.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        d_ff: int = 256,
+        random_state: int | None = None,
+    ) -> None:
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.attn = MultiHeadAttention(d_model, n_heads, random_state)
+        self.ffn = FeedForward(d_model, d_ff, random_state)
+    def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray:
+        """Apply the block to x of shape (B, T, d_model); shape is preserved."""
+        # Residual connection around attention, then normalise.
+        attended = self.norm1.forward(x + self.attn.forward(x, mask))
+        # Residual connection around the feed-forward net, then normalise.
+        return self.norm2.forward(attended + self.ffn.forward(attended))
+# ============================================================
+# Transformer Encoder (stack of layers)
+# ============================================================
+class TransformerEncoder:
+    """
+    Stack of TransformerEncoderLayer with input embedding + positional
+    encoding.
+    Parameters
+    ----------
+    vocab_size : int
+        Size of the input vocabulary (for the embedding lookup).
+    d_model : int
+        Embedding / feature dimension.
+    n_heads : int
+        Number of attention heads in every layer.
+    n_layers : int
+        Number of stacked encoder layers.
+    d_ff : int
+        Hidden size of each layer's feed-forward network.
+    max_len : int
+        Longest sequence the positional encoding supports.
+    random_state : int or None
+        Seed for weight initialisation.  With an integer, the embedding is
+        seeded with ``random_state`` and layer ``i`` with ``random_state + i``.
+        With None, the embedding and all layers are initialised from fresh,
+        unseeded generators.
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int,
+        n_heads: int,
+        n_layers: int = 2,
+        d_ff: int = 256,
+        max_len: int = 512,
+        random_state: int | None = None,
+    ) -> None:
+        rng = np.random.default_rng(random_state)
+        self.d_model    = d_model
+        self.embedding  = rng.normal(0, 0.02, (vocab_size, d_model))
+        self.pos_enc    = PositionalEncoding(d_model, max_len)
+        # Seeded runs keep the ``random_state + i`` per-layer seeds.  An
+        # unseeded encoder forwards None so the layers are genuinely random
+        # instead of being silently pinned to seeds 0..n_layers-1.
+        self.layers = [
+            TransformerEncoderLayer(
+                d_model, n_heads, d_ff,
+                None if random_state is None else random_state + i,
+            )
+            for i in range(n_layers)
+        ]
+    def forward(
+        self,
+        token_ids: np.ndarray,
+        mask: np.ndarray | None = None,
+    ) -> np.ndarray:
+        """
+        Embed the tokens, add positional encodings and run every layer.
+        Parameters
+        ----------
+        token_ids : (B, T) integer token indices
+        mask : (B, 1, T, T) or None
+            Forwarded unchanged to each layer.
+        Returns
+        -------
+        out : (B, T, d_model)
+        """
+        x = self.embedding[token_ids]                     # (B, T, d_model)
+        x = x * np.sqrt(self.d_model)                      # scale embeddings
+        x = self.pos_enc.forward(x)
+        for layer in self.layers:
+            x = layer.forward(x, mask)
+        return x
+    @staticmethod
+    def causal_mask(seq_len: int) -> np.ndarray:
+        """
+        Build a causal (look-ahead) mask of shape (1, 1, T, T)
+        where position i can attend to positions <= i.
+        Allowed positions hold 1.0 and blocked (future) positions hold 0.0.
+        """
+        mask = np.tril(np.ones((seq_len, seq_len)))
+        return mask[np.newaxis, np.newaxis, :, :]