scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. mlscratch/__init__.py +56 -0
  2. mlscratch/__main__.py +118 -0
  3. mlscratch/bayesian/__init__.py +53 -0
  4. mlscratch/bayesian/bayesian_linear_regression.py +171 -0
  5. mlscratch/bayesian/bayesian_network.py +248 -0
  6. mlscratch/bayesian/bayesian_nn.py +315 -0
  7. mlscratch/bayesian/gaussian_process.py +207 -0
  8. mlscratch/bayesian/hmm.py +277 -0
  9. mlscratch/bayesian/init.py +52 -0
  10. mlscratch/bayesian/kalman_filter.py +182 -0
  11. mlscratch/bayesian/naive_bayes.py +209 -0
  12. mlscratch/metrics/__init__.py +59 -0
  13. mlscratch/metrics/classification.py +365 -0
  14. mlscratch/metrics/regression.py +79 -0
  15. mlscratch/neural/__init__.py +121 -0
  16. mlscratch/neural/attention.py +420 -0
  17. mlscratch/neural/autoencoder.py +543 -0
  18. mlscratch/neural/boltzmann.py +231 -0
  19. mlscratch/neural/cnn.py +593 -0
  20. mlscratch/neural/cvnn.py +322 -0
  21. mlscratch/neural/gan.py +364 -0
  22. mlscratch/neural/hopfield.py +193 -0
  23. mlscratch/neural/perceptron.py +398 -0
  24. mlscratch/neural/rbf_network.py +230 -0
  25. mlscratch/neural/recurrent.py +569 -0
  26. mlscratch/preprocessing/__init__.py +38 -0
  27. mlscratch/preprocessing/encoders.py +140 -0
  28. mlscratch/preprocessing/model_selection.py +119 -0
  29. mlscratch/preprocessing/polynomial.py +105 -0
  30. mlscratch/preprocessing/scalers.py +220 -0
  31. mlscratch/py.typed +0 -0
  32. mlscratch/reinforcement/__init__.py +59 -0
  33. mlscratch/reinforcement/ddpg.py +363 -0
  34. mlscratch/reinforcement/dqn.py +319 -0
  35. mlscratch/reinforcement/ppo.py +452 -0
  36. mlscratch/reinforcement/q_learning.py +352 -0
  37. mlscratch/reinforcement/sac.py +382 -0
  38. mlscratch/reinforcement/utils.py +594 -0
  39. mlscratch/supervised/__init__.py +76 -0
  40. mlscratch/supervised/_validation.py +50 -0
  41. mlscratch/supervised/adaboost.py +255 -0
  42. mlscratch/supervised/decision_tree.py +495 -0
  43. mlscratch/supervised/gradient_boosting.py +354 -0
  44. mlscratch/supervised/knn.py +234 -0
  45. mlscratch/supervised/lasso_regression.py +125 -0
  46. mlscratch/supervised/linear_models.py +459 -0
  47. mlscratch/supervised/linear_regression.py +197 -0
  48. mlscratch/supervised/logistic_regression.py +119 -0
  49. mlscratch/supervised/naive_bayes.py +113 -0
  50. mlscratch/supervised/random_forest.py +321 -0
  51. mlscratch/supervised/ridge_regression.py +93 -0
  52. mlscratch/supervised/svm.py +356 -0
  53. mlscratch/unsupervised/__init__.py +39 -0
  54. mlscratch/unsupervised/apriori.py +178 -0
  55. mlscratch/unsupervised/dbscan.py +141 -0
  56. mlscratch/unsupervised/gmm.py +204 -0
  57. mlscratch/unsupervised/hierarchical_clustering.py +137 -0
  58. mlscratch/unsupervised/ica.py +167 -0
  59. mlscratch/unsupervised/kmeans.py +135 -0
  60. mlscratch/unsupervised/kmedoids.py +133 -0
  61. mlscratch/unsupervised/pca.py +103 -0
  62. mlscratch/unsupervised/tsne.py +200 -0
  63. scratchkit-0.2.0.dist-info/METADATA +241 -0
  64. scratchkit-0.2.0.dist-info/RECORD +68 -0
  65. scratchkit-0.2.0.dist-info/WHEEL +5 -0
  66. scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
  67. scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
  68. scratchkit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,79 @@
1
+ r"""
2
+ Regression Metrics
3
+ ===================
4
+ Evaluation metrics for regressors, implemented from scratch in pure numpy.
5
+
6
+ .. math::
7
+ \mathrm{MSE} = \frac1n\sum_i (y_i-\hat y_i)^2, \qquad
8
+ \mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad
9
+ \mathrm{MAE} = \frac1n\sum_i |y_i-\hat y_i|
10
+
11
+ .. math::
12
+ \mathrm{MAPE} = \frac1n\sum_i \left|\frac{y_i-\hat y_i}{y_i}\right|, \qquad
13
+ R^2 = 1 - \frac{\sum_i(y_i-\hat y_i)^2}{\sum_i(y_i-\bar y)^2}
14
+
15
+ .. math::
16
+ \mathrm{ExplainedVariance} = 1 - \frac{\mathrm{Var}(y-\hat y)}{\mathrm{Var}(y)}
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import numpy as np
22
+ from numpy.typing import ArrayLike, NDArray
23
+
24
+ _EPS = 1e-12
25
+
26
+
27
def _validate(
    y_true: ArrayLike, y_pred: ArrayLike
) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """Coerce targets and predictions to flat float64 arrays and sanity-check them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; converted to ``float64`` and flattened to 1-D.
    y_pred : array-like
        Predicted values; converted to ``float64`` and flattened to 1-D.

    Returns
    -------
    (y_true, y_pred) : tuple of 1-D float64 arrays

    Raises
    ------
    ValueError
        If the flattened arrays differ in length, or if they are empty.
    """
    truth = np.asarray(y_true, dtype=np.float64).flatten()
    preds = np.asarray(y_pred, dtype=np.float64).flatten()

    n_true = truth.shape[0]
    n_pred = preds.shape[0]

    # Length mismatch is reported before emptiness, so ([], [1]) complains
    # about the mismatch rather than about being empty.
    if n_true != n_pred:
        raise ValueError(
            f"y_true has {n_true} samples but y_pred has {n_pred}."
        )
    if n_true == 0:
        raise ValueError("y_true and y_pred must not be empty.")

    return truth, preds
39
+
40
+
41
def mean_squared_error(y_true: ArrayLike, y_pred: ArrayLike, squared: bool = True) -> float:
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.
    squared : bool, default True
        When ``False`` the square root of the MSE (i.e. the RMSE) is returned.

    Returns
    -------
    float
    """
    truth, preds = _validate(y_true, y_pred)
    residuals = truth - preds
    mse = float(np.mean(residuals ** 2))
    if squared:
        return mse
    return float(np.sqrt(mse))
46
+
47
+
48
def root_mean_squared_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
    """Return the root mean squared error (square root of the MSE).

    Thin convenience wrapper that delegates to ``mean_squared_error`` with
    ``squared=False``.
    """
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    return rmse
51
+
52
+
53
def mean_absolute_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
    """Return the mean of the absolute differences ``|y_true - y_pred|``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
    """
    truth, preds = _validate(y_true, y_pred)
    abs_errors = np.abs(truth - preds)
    return float(np.mean(abs_errors))
56
+
57
+
58
def mean_absolute_percentage_error(y_true: ArrayLike, y_pred: ArrayLike) -> float:
    """Return the mean absolute percentage error as a fraction (not multiplied by 100).

    Targets whose magnitude is below the module-level ``_EPS`` have their
    denominator replaced by ``_EPS`` so that the division never hits zero.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
    """
    truth, preds = _validate(y_true, y_pred)
    magnitude = np.abs(truth)
    # Floor near-zero targets at _EPS; everything else divides by |y_true|.
    safe_denominator = np.where(magnitude < _EPS, _EPS, magnitude)
    relative_errors = np.abs((truth - preds) / safe_denominator)
    return float(np.mean(relative_errors))
64
+
65
+
66
def r2_score(y_true: ArrayLike, y_pred: ArrayLike) -> float:
    """Return the coefficient of determination ``1 - SS_res / SS_tot``.

    When the total sum of squares of ``y_true`` is not greater than the
    module-level ``_EPS`` (a constant target), 0.0 is returned instead of
    dividing by (near) zero.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
    """
    truth, preds = _validate(y_true, y_pred)
    residual_ss = float(np.sum((truth - preds) ** 2))
    total_ss = float(np.sum((truth - truth.mean()) ** 2))

    # Written as "not greater than" so a NaN total also takes the 0.0 branch.
    if not total_ss > _EPS:
        return 0.0
    return 1.0 - residual_ss / total_ss
73
+
74
+
75
def explained_variance_score(y_true: ArrayLike, y_pred: ArrayLike) -> float:
    """Return ``1 - Var(y_true - y_pred) / Var(y_true)``.

    When the variance of ``y_true`` is not greater than the module-level
    ``_EPS``, 0.0 is returned instead of dividing by (near) zero.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
    """
    truth, preds = _validate(y_true, y_pred)
    target_variance = float(np.var(truth))
    residual_variance = float(np.var(truth - preds))

    # Written as "not greater than" so a NaN variance also takes the 0.0 branch.
    if not target_variance > _EPS:
        return 0.0
    return 1.0 - residual_variance / target_variance
@@ -0,0 +1,121 @@
1
+ """
2
+ mlscratch.neural
3
+ =================
4
+ From-scratch implementations of neural network architectures.
5
+ Pure NumPy — no PyTorch, no TensorFlow.
6
+
7
+ Perceptrons
8
+ -----------
9
+ SingleLayerPerceptron – binary classification or regression
10
+ MultiLayerPerceptron – feedforward network, classification or regression
11
+
12
+ Autoencoders
13
+ ------------
14
+ Autoencoder – tied-weight vanilla autoencoder
15
+ DenoisingAutoencoder – trained on corrupted inputs (Gaussian / dropout noise)
16
+ VariationalAutoencoder – Gaussian latent space, reparameterisation trick
17
+
18
+ Recurrent Networks
19
+ -------------------
20
+ SimpleRNN – Elman RNN, classification/regression/feature-extractor
21
+ LSTMCell – single-timestep LSTM cell
22
+ LSTM – multi-layer LSTM, optional linear output head
23
+ EncoderDecoder – seq2seq RNN encoder-decoder
24
+
25
+ Convolutional Networks
26
+ ------------------------
27
+ Conv2D, MaxPool2D, AvgPool2D, BatchNorm2D, Flatten, Dense – CNN building blocks
28
+ SimpleCNN – pre-wired conv → pool → conv → pool → dense → softmax
29
+
30
+ Attention / Transformer
31
+ --------------------------
32
+ ScaledDotProductAttention
33
+ MultiHeadAttention
34
+ PositionalEncoding
35
+ LayerNorm
36
+ FeedForward
37
+ TransformerEncoderLayer
38
+ TransformerEncoder
39
+
40
+ Generative Models
41
+ -------------------
42
+ Generator, Discriminator, GAN – adversarial generative network
43
+
44
+ Associative Memory
45
+ --------------------
46
+ HopfieldNetwork – discrete bipolar associative memory
47
+
48
+ Energy-Based Models
49
+ ----------------------
50
+ RestrictedBoltzmannMachine – RBM trained with Contrastive Divergence
51
+
52
+ Radial Basis Function Networks
53
+ ---------------------------------
54
+ RBFNetwork – Gaussian RBF hidden layer + closed-form linear output
55
+
56
+ Complex-Valued Networks
57
+ ---------------------------
58
+ ComplexDense – complex-valued fully-connected layer
59
+ ComplexValuedNN – multi-layer complex-valued feedforward network
60
+
61
+ Note
62
+ ----
63
+ Bayesian Neural Networks live in ``mlscratch.bayesian.bayesian_nn``
64
+ (``BayesianNeuralNetwork``) since they are fundamentally a Bayesian
65
+ inference method applied to a network architecture.
66
+ """
67
+
68
+ from .perceptron import SingleLayerPerceptron, MultiLayerPerceptron # noqa: F401
69
+ from .autoencoder import ( # noqa: F401
70
+ Autoencoder,
71
+ DenoisingAutoencoder,
72
+ VariationalAutoencoder,
73
+ )
74
+ from .recurrent import SimpleRNN, LSTMCell, LSTM, EncoderDecoder # noqa: F401
75
+ from .cnn import ( # noqa: F401
76
+ Conv2D,
77
+ MaxPool2D,
78
+ AvgPool2D,
79
+ BatchNorm2D,
80
+ Flatten,
81
+ Dense,
82
+ SimpleCNN,
83
+ )
84
+ from .attention import ( # noqa: F401
85
+ ScaledDotProductAttention,
86
+ MultiHeadAttention,
87
+ PositionalEncoding,
88
+ LayerNorm,
89
+ FeedForward,
90
+ TransformerEncoderLayer,
91
+ TransformerEncoder,
92
+ )
93
+ from .gan import Generator, Discriminator, GAN # noqa: F401
94
+ from .hopfield import HopfieldNetwork # noqa: F401
95
+ from .boltzmann import RestrictedBoltzmannMachine # noqa: F401
96
+ from .rbf_network import RBFNetwork # noqa: F401
97
+ from .cvnn import ComplexDense, ComplexValuedNN # noqa: F401
98
+
99
+ __all__ = [
100
+ # Perceptrons
101
+ "SingleLayerPerceptron", "MultiLayerPerceptron",
102
+ # Autoencoders
103
+ "Autoencoder", "DenoisingAutoencoder", "VariationalAutoencoder",
104
+ # Recurrent
105
+ "SimpleRNN", "LSTMCell", "LSTM", "EncoderDecoder",
106
+ # CNN
107
+ "Conv2D", "MaxPool2D", "AvgPool2D", "BatchNorm2D", "Flatten", "Dense", "SimpleCNN",
108
+ # Attention / Transformer
109
+ "ScaledDotProductAttention", "MultiHeadAttention", "PositionalEncoding",
110
+ "LayerNorm", "FeedForward", "TransformerEncoderLayer", "TransformerEncoder",
111
+ # GAN
112
+ "Generator", "Discriminator", "GAN",
113
+ # Associative memory
114
+ "HopfieldNetwork",
115
+ # Energy-based
116
+ "RestrictedBoltzmannMachine",
117
+ # RBF
118
+ "RBFNetwork",
119
+ # Complex-valued
120
+ "ComplexDense", "ComplexValuedNN",
121
+ ]
@@ -0,0 +1,420 @@
1
+ """
2
+ Attention Mechanisms and Transformer
3
+ ======================================
4
+ Building blocks of the Transformer architecture (Vaswani et al., 2017).
5
+
6
+ ScaledDotProductAttention
7
+ --------------------------
8
+ The core attention operation:
9
+
10
+ Attention(Q, K, V) = softmax(QK^T / √d_k) V
11
+
12
+ MultiHeadAttention
13
+ -------------------
14
+ Splits Q, K, V into ``n_heads`` parallel attention computations,
15
+ concatenates results, and projects back to ``d_model``:
16
+
17
+ head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
18
+ MHA(Q,K,V) = Concat(head_1, ..., head_h) W^O
19
+
20
+ PositionalEncoding
21
+ -------------------
22
+ Injects order information using sinusoids of varying frequency:
23
+
24
+ PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
25
+ PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
26
+
27
+ LayerNorm
28
+ ---------
29
+ Normalises across the feature dimension:
30
+
31
+ LN(x) = γ (x - μ) / √(σ² + ε) + β
32
+
33
+ FeedForward
34
+ -----------
35
+ Two-layer MLP with ReLU, applied position-wise:
36
+
37
+ FFN(x) = ReLU(x W1 + b1) W2 + b2
38
+
39
+ TransformerEncoderLayer / TransformerEncoder
40
+ ----------------------------------------------
41
+ Standard encoder block: MHA → Add&Norm → FFN → Add&Norm, stacked
42
+ ``n_layers`` times.
43
+
44
+ References
45
+ ----------
46
+ Vaswani et al. (2017). Attention is all you need. NeurIPS.
47
+
48
+ Only numpy is used.
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ import numpy as np
54
+
55
+
56
+ # ============================================================
57
+ # Helpers
58
+ # ============================================================
59
+
60
def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Softmax along ``axis``.

    The per-slice maximum is subtracted before exponentiating so that large
    inputs do not overflow ``exp``; the result is unchanged mathematically.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    normaliser = np.sum(exps, axis=axis, keepdims=True)
    return exps / normaliser
63
+
64
+
65
def _relu(x: np.ndarray) -> np.ndarray:
    """Element-wise rectified linear unit: ``max(0, x)``."""
    floor = 0.0
    return np.maximum(floor, x)
67
+
68
+
69
+ # ============================================================
70
+ # Scaled Dot-Product Attention
71
+ # ============================================================
72
+
73
class ScaledDotProductAttention:
    """
    Scaled dot-product attention (stateless — no learnable parameters).

        Attention(Q, K, V) = softmax(masked(QK^T / √d_k)) V

    where ``masked`` overwrites disallowed score entries with a large
    negative constant before the softmax.
    """

    def __call__(
        self,
        Q: np.ndarray,
        K: np.ndarray,
        V: np.ndarray,
        mask: np.ndarray | None = None,
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Parameters
        ----------
        Q : (..., seq_len_q, d_k)
        K : (..., seq_len_k, d_k)
        V : (..., seq_len_k, d_v)
        mask : (..., seq_len_q, seq_len_k) or None
            Entries where ``mask == 0`` have their score replaced by ``-1e9``
            before the softmax, so they receive (effectively) zero weight.
            Used for causal / padding masks.

        Returns
        -------
        output : (..., seq_len_q, d_v)
        attn_weights : (..., seq_len_q, seq_len_k)
        """
        key_dim = Q.shape[-1]
        logits = Q @ np.swapaxes(K, -1, -2) / np.sqrt(key_dim)

        if mask is not None:
            keep = mask
            # A mask with more dims than the scores is reduced by dropping
            # leading axes one at a time; squeeze(0) raises if such an axis is
            # not of length 1.
            surplus_dims = keep.ndim - logits.ndim
            for _ in range(max(surplus_dims, 0)):
                keep = keep.squeeze(0)
            logits = np.where(keep == 0, -1e9, logits)

        weights = _softmax(logits, axis=-1)
        return weights @ V, weights
115
+
116
+
117
+ # ============================================================
118
+ # Multi-Head Attention
119
+ # ============================================================
120
+
121
class MultiHeadAttention:
    """
    Multi-head self-attention with learnable projection matrices.

    Parameters
    ----------
    d_model : int
        Input/output feature dimension.
    n_heads : int
        Number of attention heads. Must evenly divide ``d_model``.
    random_state : int or None
        Seed handed to ``np.random.default_rng`` for weight initialisation.

    Attributes
    ----------
    W_q, W_k, W_v, W_o : (d_model, d_model) arrays
        Query, key, value and output projections.
    last_attn_weights_ : array or None
        Attention weights from the most recent ``forward`` call.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        random_state: int | None = None,
    ) -> None:
        if d_model % n_heads != 0:
            raise ValueError("d_model must be divisible by n_heads.")
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        rng = np.random.default_rng(random_state)
        std = np.sqrt(2.0 / d_model)
        weight_shape = (d_model, d_model)
        # Draw order (q, k, v, o) is kept fixed so a given seed reproduces the
        # same four matrices.
        self.W_q, self.W_k, self.W_v, self.W_o = (
            rng.normal(0, std, weight_shape) for _ in range(4)
        )

        self._attn = ScaledDotProductAttention()
        self.last_attn_weights_: np.ndarray | None = None

    def _split_heads(self, x: np.ndarray) -> np.ndarray:
        """(B, T, d_model) → (B, n_heads, T, d_k)"""
        batch, steps, _ = x.shape
        per_head = x.reshape(batch, steps, self.n_heads, self.d_k)
        return per_head.transpose(0, 2, 1, 3)

    def _combine_heads(self, x: np.ndarray) -> np.ndarray:
        """(B, n_heads, T, d_k) → (B, T, d_model)"""
        batch, heads, steps, head_dim = x.shape
        return x.transpose(0, 2, 1, 3).reshape(batch, steps, heads * head_dim)

    def forward(
        self,
        x: np.ndarray,
        mask: np.ndarray | None = None,
    ) -> np.ndarray:
        """
        Self-attention (Q=K=V=x).

        Parameters
        ----------
        x : (B, T, d_model)
        mask : (B, 1, T, T) or None

        Returns
        -------
        out : (B, T, d_model)
        """
        queries = self._split_heads(x @ self.W_q)
        keys = self._split_heads(x @ self.W_k)
        values = self._split_heads(x @ self.W_v)

        head_outputs, weights = self._attn(queries, keys, values, mask)
        # Kept on the instance so callers can inspect the latest attention map.
        self.last_attn_weights_ = weights

        return self._combine_heads(head_outputs) @ self.W_o
194
+
195
+
196
+ # ============================================================
197
+ # Positional Encoding
198
+ # ============================================================
199
+
200
class PositionalEncoding:
    """
    Sinusoidal positional encoding (no learnable parameters).

        PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Parameters
    ----------
    d_model : int
        Feature dimension. Both even and odd values are supported.
    max_len : int
        Maximum supported sequence length.

    Attributes
    ----------
    pe : (max_len, d_model) array
        Pre-computed encoding table.
    """

    def __init__(self, d_model: int, max_len: int = 512) -> None:
        self.d_model = d_model
        self.max_len = max_len
        self.pe = self._build(d_model, max_len)

    @staticmethod
    def _build(d_model: int, max_len: int) -> np.ndarray:
        """Return the (max_len, d_model) table of sine/cosine encodings."""
        position = np.arange(max_len)[:, np.newaxis]          # (max_len, 1)
        div_term = np.exp(
            np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model)
        )                                                      # (ceil(d_model/2),)
        angles = position * div_term                           # (max_len, ceil(d_model/2))

        pe = np.zeros((max_len, d_model))
        pe[:, 0::2] = np.sin(angles)
        # For odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus frequency; for even d_model
        # the slice is a no-op.
        pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
        return pe

    def forward(self, x: np.ndarray) -> np.ndarray:
        """
        Add positional encoding to x.

        Parameters
        ----------
        x : (B, T, d_model) or (T, d_model)

        Returns
        -------
        same shape as x

        Raises
        ------
        ValueError
            If the sequence length ``T`` exceeds ``max_len``.
        """
        T = x.shape[-2]
        if T > self.max_len:
            raise ValueError(f"Sequence length {T} exceeds max_len={self.max_len}.")
        return x + self.pe[:T]
243
+
244
+
245
+ # ============================================================
246
+ # LayerNorm
247
+ # ============================================================
248
+
249
class LayerNorm:
    """
    Layer normalisation over the last dimension:

        LN(x) = γ (x - μ) / √(σ² + ε) + β

    Parameters
    ----------
    d_model : int
        Size of the last (feature) dimension.
    eps : float
        Constant added to the variance before the square root.

    Attributes
    ----------
    gamma : (d_model,) array, initialised to ones
    beta : (d_model,) array, initialised to zeros
    """

    def __init__(self, d_model: int, eps: float = 1e-6) -> None:
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x: np.ndarray) -> np.ndarray:
        """x : (..., d_model) → same shape, normalised along the last axis."""
        mu = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        centred = x - mu
        normalised = centred / np.sqrt(variance + self.eps)
        return self.gamma * normalised + self.beta
270
+
271
+
272
+ # ============================================================
273
+ # Feed-Forward Network
274
+ # ============================================================
275
+
276
class FeedForward:
    """
    Position-wise feed-forward network: Linear → ReLU → Linear.

        FFN(x) = ReLU(x W1 + b1) W2 + b2

    Parameters
    ----------
    d_model : int
        Input and output feature dimension.
    d_ff : int
        Hidden layer size (typically 4 × d_model).
    random_state : int or None
        Seed handed to ``np.random.default_rng`` for weight initialisation.
    """

    def __init__(
        self,
        d_model: int,
        d_ff: int = 256,
        random_state: int | None = None,
    ) -> None:
        rng = np.random.default_rng(random_state)
        # Each weight matrix is drawn with std sqrt(2 / fan_in); W1 is drawn
        # before W2 so a given seed reproduces the same weights.
        std_in = np.sqrt(2.0 / d_model)
        self.W1 = rng.normal(0, std_in, (d_model, d_ff))
        self.b1 = np.zeros(d_ff)
        std_hidden = np.sqrt(2.0 / d_ff)
        self.W2 = rng.normal(0, std_hidden, (d_ff, d_model))
        self.b2 = np.zeros(d_model)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """x : (..., d_model) → (..., d_model)"""
        pre_activation = x @ self.W1 + self.b1
        hidden = _relu(pre_activation)
        return hidden @ self.W2 + self.b2
304
+
305
+
306
+ # ============================================================
307
+ # Transformer Encoder Layer
308
+ # ============================================================
309
+
310
class TransformerEncoderLayer:
    """
    A single post-norm Transformer encoder layer:

        x = LayerNorm(x + MultiHeadAttention(x))
        x = LayerNorm(x + FeedForward(x))

    Parameters
    ----------
    d_model : int
    n_heads : int
    d_ff : int
    random_state : int or None
        Passed unchanged to both the attention and the feed-forward sub-layer.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        d_ff: int = 256,
        random_state: int | None = None,
    ) -> None:
        self.attn = MultiHeadAttention(
            d_model=d_model, n_heads=n_heads, random_state=random_state
        )
        self.ffn = FeedForward(
            d_model=d_model, d_ff=d_ff, random_state=random_state
        )
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray:
        """x : (B, T, d_model) → (B, T, d_model)"""
        # Sub-layer 1: self-attention with residual connection, then norm.
        attended = self.norm1.forward(x + self.attn.forward(x, mask))
        # Sub-layer 2: position-wise FFN with residual connection, then norm.
        return self.norm2.forward(attended + self.ffn.forward(attended))
345
+
346
+
347
+ # ============================================================
348
+ # Transformer Encoder (stack of layers)
349
+ # ============================================================
350
+
351
class TransformerEncoder:
    """
    Stack of TransformerEncoderLayer with input embedding + positional
    encoding.

    Parameters
    ----------
    vocab_size : int
        Size of the input vocabulary (for the embedding lookup).
    d_model : int
    n_heads : int
    n_layers : int
    d_ff : int
    max_len : int
    random_state : int or None
        Seed for weight initialisation. With an integer, layer ``i`` is
        seeded with ``random_state + i`` so the whole model is reproducible.
        With ``None``, every layer is left unseeded as well, so the whole
        model is initialised non-deterministically.

    Attributes
    ----------
    embedding : (vocab_size, d_model) array
        Token embedding table, drawn from N(0, 0.02²).
    pos_enc : PositionalEncoding
    layers : list of TransformerEncoderLayer
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int,
        n_heads: int,
        n_layers: int = 2,
        d_ff: int = 256,
        max_len: int = 512,
        random_state: int | None = None,
    ) -> None:
        rng = np.random.default_rng(random_state)
        self.d_model = d_model
        self.embedding = rng.normal(0, 0.02, (vocab_size, d_model))
        self.pos_enc = PositionalEncoding(d_model, max_len)

        # Propagate "unseeded" to the layers: the previous
        # ``(random_state or 0) + i`` silently pinned layer seeds to 0..n-1
        # when random_state was None, making the layers deterministic while
        # the embedding was not. Seeded behaviour is unchanged.
        self.layers = [
            TransformerEncoderLayer(
                d_model,
                n_heads,
                d_ff,
                None if random_state is None else random_state + i,
            )
            for i in range(n_layers)
        ]

    def forward(
        self,
        token_ids: np.ndarray,
        mask: np.ndarray | None = None,
    ) -> np.ndarray:
        """
        Parameters
        ----------
        token_ids : (B, T) integer token indices
        mask : (B, 1, T, T) or None

        Returns
        -------
        out : (B, T, d_model)
        """
        x = self.embedding[token_ids]       # (B, T, d_model)
        x = x * np.sqrt(self.d_model)       # scale embeddings
        x = self.pos_enc.forward(x)

        for layer in self.layers:
            x = layer.forward(x, mask)

        return x

    @staticmethod
    def causal_mask(seq_len: int) -> np.ndarray:
        """
        Build a causal (look-ahead) mask of shape (1, 1, T, T)
        where position i can attend to positions <= i.
        """
        mask = np.tril(np.ones((seq_len, seq_len)))
        return mask[np.newaxis, np.newaxis, :, :]